Hash Function¶

For the hash function, we'll use the cyclic hash code hc_cyclic with shift value $s = 5$.

This will be followed by MAD compression with $N = 11, a = 7, b = 3$.

def hc_cyclic(s, str):
    """Return the 32-bit cyclic-shift hash code of the string ``str``.

    For each character the running code is rotated left by ``s`` bits
    (bits shifted out of the top re-enter at the bottom), XOR-ed with the
    character's code point, and truncated to 32 bits.

    s   -- rotation amount in bits
    str -- the string to hash; the empty string hashes to 0
    """
    mask = 0xffffffff
    code = 0
    for ch in str:
        # The two shifts together form a left rotation because ``code``
        # never exceeds 32 bits at the start of an iteration.
        rotated = (code << s) | (code >> (32 - s))
        code = (rotated ^ ord(ch)) & mask
    return code

def comp_mad(N, a, b, k):
    """Compress hash code ``k`` into the range 0..N-1 (multiply-add-divide).

    N    -- table size (the modulus)
    a, b -- multiplier and offset of the linear map applied to ``k``
    k    -- the integer hash code to compress
    """
    scaled = a * k + b
    return abs(scaled) % N

def h(str):
    """Map a string key to a table index in 0..10.

    The key is first turned into a cyclic hash code (shift 5) and that code
    is then MAD-compressed with N = 11, a = 7, b = 3.
    """
    # Alternative compression parameters for a much larger table:
    #    comp_mad(1000037, 761861, 3, ...)
    return comp_mad(11, 7, 3, hc_cyclic(5, str))

# Number of slots in the hash tables built below.  Keep it equal to the
# modulus passed to comp_mad() inside h(), otherwise h() can return an
# index that does not fit the table.  The commented value pairs with the
# commented-out comp_mad() call in h().
#N = 1000037
N = 11

Separate Chaining¶

Initially, the hash table ht is a list of $N$ empty lists.

# One independent empty bucket per slot (a fresh list each time, so the
# buckets are not aliases of one another).
ht = [[] for _ in range(N)]

print(ht)

[[], [], [], [], [], [], [], [], [], [], []]

Insert into ht by hashing the key to produce an index into ht and appending the (key, value) pair to the indexed list.

ht[h("marron")].append(("marron","CMSC341-04,CMSC341-07"))

ht[h("marron")]

[('marron', 'CMSC341-04,CMSC341-07')]

print(ht)

[[], [], [], [], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07')], []]

ht[h("marron")].append(("marron","CMSC341,CMSC441"))
print(ht)

[[], [], [], [], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07'), ('marron', 'CMSC341,CMSC441')], []]

The insert() function prevents duplicate keys from being entered. If a key is already in the hash table, its value is overwritten.

def insert(ht, h, pair):
    """Insert a (key, value) pair into the separate-chaining table ``ht``.

    If the key is already present in its bucket, the existing pair is
    replaced in place (same position in the chain); otherwise the pair is
    appended to the end of the bucket.

    ht   -- list of buckets, each bucket a list of (key, value) pairs
    h    -- hash function mapping a key to an index into ``ht``
    pair -- sequence whose first element is the key
    """
    key = pair[0]
    bucket = ht[h(key)]
    for pos, entry in enumerate(bucket):
        if entry[0] == key:
            # Overwrite rather than append so keys stay unique.
            bucket[pos] = pair
            return
    bucket.append(pair)

insert(ht, h, ("johnson", "CMSC341,CMSC201"))

ht[h("johnson")]

[('johnson', 'CMSC341,CMSC201')]

print(ht)

[[], [], [], [('johnson', 'CMSC341,CMSC201')], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07'), ('marron', 'CMSC341,CMSC441')], []]

insert(ht, h, ("johnson", "CMSC202"))

ht[h("johnson")]

[('johnson', 'CMSC202')]

print(ht)

[[], [], [], [('johnson', 'CMSC202')], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07'), ('marron', 'CMSC341,CMSC441')], []]

insert(ht, h, ("chang", "CMSC441,CMSC641"))

print(ht)

[[], [], [], [('johnson', 'CMSC202')], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07'), ('marron', 'CMSC341,CMSC441'), ('chang', 'CMSC441,CMSC641')], []]

The remove() function returns True if the key was found and its pair removed; it returns False if the key was not found.

def remove(ht, h, key):
    """Remove the pair stored under ``key`` from the chaining table ``ht``.

    Only the first pair in the bucket whose key matches is removed.

    ht  -- list of buckets, each bucket a list of (key, value) pairs
    h   -- hash function mapping a key to an index into ``ht``
    key -- the key to delete

    Returns True if a pair was removed, False if the key was not found.
    """
    bucket = ht[h(key)]
    for pos, entry in enumerate(bucket):
        if entry[0] == key:
            # Delete by position: the index is already known, so there is
            # no need for a second, equality-based scan of the bucket.
            del bucket[pos]
            return True
    return False

remove(ht, h, "johnson")

True

ht[h("johnson")]

[]

remove(ht, h, "chang")

True

print(ht)

[[], [], [], [], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07'), ('marron', 'CMSC341,CMSC441')], []]

If the key is found, the search function returns True and the pair corresponding to the specified key; otherwise, it returns False and an empty tuple.

def search(ht, h, key):
    """Look up ``key`` in the separate-chaining table ``ht``.

    ht  -- list of buckets, each bucket a list of (key, value) pairs
    h   -- hash function mapping a key to an index into ``ht``
    key -- the key to find

    Returns (True, pair) for the first pair in the bucket whose key
    matches, or (False, ()) when the bucket holds no such pair.
    """
    for entry in ht[h(key)]:
        if entry[0] == key:
            return True, entry
    return False, ()

insert(ht,h,("johnson", "CMSC341,CMSC201"))

res, pair = search(ht, h, "johnson")
if res:
    print(pair)
else:
    print("not found")

('johnson', 'CMSC341,CMSC201')

res, pair = search(ht, h, "marron")
if res:
    print(pair)
else:
    print("not found")

('marron', 'CMSC341-04,CMSC341-07')

res, pair = search(ht, h, "chang")
if res:
    print(pair)
else:
    print("not found")

not found

insert(ht, h, ("chang", "CMSC441,CMSC641"))
print(ht)

[[], [], [], [('johnson', 'CMSC341,CMSC201')], [], [], [], [], [], [('marron', 'CMSC341-04,CMSC341-07'), ('marron', 'CMSC341,CMSC441'), ('chang', 'CMSC441,CMSC641')], []]

h("marron")

9

h("chang")

9

Linear Probing¶

To do linear probing, we create the table as a list of empty tuples. We also need to define a vacant tuple to represent a location in the table where there was data but which has since been removed. Removing can create gaps in the data, but we need to search past these gaps. When we see a vacant entry, we keep searching; when we see an empty entry, (), we stop searching.

# Every slot starts out as the empty tuple, meaning "never used".
htl = [()] * N
# Tombstone left behind by a removal.  It is deliberately *not* the empty
# tuple, so probes keep going past it instead of stopping.
vacant = ("", "")

print(htl)

[(), (), (), (), (), (), (), (), (), (), ()]

def insertl(htl, h, pair):
    """Insert or update a (key, value) pair using linear probing.

    Probing starts at ``h(key)`` and moves forward one slot at a time,
    wrapping around at the end of the table.  If the key is met along the
    way its pair is overwritten.  Otherwise the pair goes into the first
    tombstone (``vacant``) seen during the probe, or, if there was none,
    into the empty slot ``()`` that ended the probe.

    At most ``len(htl)`` slots are examined, so the call terminates even
    when the table has no empty slot left.

    htl  -- the table: a list whose slots are (), ``vacant`` or a pair
    h    -- hash function mapping a key to an index into ``htl``
    pair -- sequence whose first element is the key

    Raises RuntimeError if the key is absent and the table has neither an
    empty slot nor a tombstone to put it in.
    """
    key = pair[0]
    size = len(htl)
    indx = h(key)
    first_vacant = None
    free = None
    for _ in range(size):
        slot = htl[indx]
        if slot == ():
            free = indx
            break
        if slot[0] == key:
            htl[indx] = pair
            return
        if first_vacant is None and slot == vacant:
            # Remember the tombstone but keep probing: the key may still
            # be stored further along the probe sequence.
            first_vacant = indx
        indx = (indx + 1) % size
    target = free if first_vacant is None else first_vacant
    if target is None:
        raise RuntimeError("hash table is full")
    htl[target] = pair

insertl(htl, h, ("chang", "CMSC441,CMSC641"))

print(htl)

[(), (), (), (), (), (), (), (), (), ('chang', 'CMSC441,CMSC641'), ()]

htl[h("chang")]

('chang', 'CMSC441,CMSC641')

insertl(htl, h, ("marron", "CMSC341,CMSC441"))

print(htl)

[(), (), (), (), (), (), (), (), (), ('chang', 'CMSC441,CMSC641'), ('marron', 'CMSC341,CMSC441')]

def removel(htl, h, key):
    """Remove ``key`` from the linear-probing table ``htl``.

    The slot holding the key is overwritten with the ``vacant`` tombstone
    rather than emptied, so later probes continue past it.  Probing stops
    at the first empty slot ``()`` or after ``len(htl)`` slots, whichever
    comes first, so the call terminates even on a table with no empty slot.

    htl -- the table: a list whose slots are (), ``vacant`` or a pair
    h   -- hash function mapping a key to an index into ``htl``
    key -- the key to delete

    Returns True if the key was found and removed, False otherwise.
    """
    size = len(htl)
    indx = h(key)
    for _ in range(size):
        slot = htl[indx]
        if slot == ():
            return False
        if slot[0] == key:
            htl[indx] = vacant
            return True
        indx = (indx + 1) % size
    return False

insertl(htl, h, ("marron", "CMSC341") )

removel(htl, h, "chang")

True

print(htl)

[(), (), (), (), (), (), (), (), (), ('', ''), ('marron', 'CMSC341')]

def searchl(htl, h, key):
    """Look up ``key`` in the linear-probing table ``htl``.

    Probing starts at ``h(key)`` and moves forward one slot at a time,
    wrapping around at the end of the table.  It stops at the first empty
    slot ``()`` or after ``len(htl)`` slots, whichever comes first, so the
    call terminates even on a table with no empty slot.

    htl -- the table: a list whose slots are (), a tombstone or a pair
    h   -- hash function mapping a key to an index into ``htl``
    key -- the key to find

    Returns (True, pair) if the key is found, otherwise (False, ()).
    """
    size = len(htl)
    indx = h(key)
    for _ in range(size):
        slot = htl[indx]
        if slot == ():
            break
        if slot[0] == key:
            return True, slot
        indx = (indx + 1) % size
    return False, ()

searchl(htl, h, "chang")

(False, ())

searchl(htl, h, "marron")

(True, ('marron', 'CMSC341'))

searchl(htl, h, "johnson")

(False, ())

insertl(htl, h, ("johnson","CMSC341,CMSC201"))

searchl(htl, h, "johnson")

(True, ('johnson', 'CMSC341,CMSC201'))

print(htl)

[(), (), (), ('johnson', 'CMSC341,CMSC201'), (), (), (), (), (), ('', ''), ('marron', 'CMSC341')]

insertl(htl, h, ("chang","CMSC441,CMSC641"))

print(htl)

[(), (), (), ('johnson', 'CMSC341,CMSC201'), (), (), (), (), (), ('chang', 'CMSC441,CMSC641'), ('marron', 'CMSC341')]

removel(htl,h,"marron")

True

print(htl)

[(), (), (), ('johnson', 'CMSC341,CMSC201'), (), (), (), (), (), ('chang', 'CMSC441,CMSC641'), ('', '')]

insertl(htl, h, ("marron", "CMSC341,CMSC441"))

print(htl)

[(), (), (), ('johnson', 'CMSC341,CMSC201'), (), (), (), (), (), ('chang', 'CMSC441,CMSC641'), ('marron', 'CMSC341,CMSC441')]

Quadratic Probing¶

In Quadratic Probing, we probe at positions $A[(h + j^2)\mod N] \mbox{ for } j=0, 1, 2, \ldots$. This has the effect of reducing clustering in the table by spreading out the collisions. However, there are problems with quadratic probing. It is possible for the table to be nearly half empty and yet quadratic probing may not find an open location.

Example¶

Suppose $N = 11 \mbox{ and } h = 2$.

The values $(h + j^2) \mod N$ do not include 1, 4, 8, 9, or 10. If these were the only open locations in the table, quadratic probing would fail to find one.

This phenomenon is related to the important mathematical concept of quadratic residues.

Computational Example¶

# Every slot starts out as the empty tuple, meaning "never used".
htq = [()] * N
# Tombstone marking a slot whose entry was removed; distinct from the
# empty tuple used for never-used slots.
vacant = ("", "")

print(htq)

[(), (), (), (), (), (), (), (), (), (), ()]

def insertq(htq, h, pair):
    """Insert or update a (key, value) pair using quadratic probing.

    Slots are examined at (h(key) + j*j) % N for j = 0, 1, 2, ... until one
    is found that is empty ``()``, already holds the key, or is the
    ``vacant`` tombstone; the pair is stored in that slot.

    Caveats of this simple version:
      * The loop has no bound.  If the probe sequence never reaches such a
        slot, the call does not return.
      * Probing stops at the first tombstone without checking whether the
        key is stored further along the probe sequence.

    htq  -- the table: a list whose slots are (), ``vacant`` or a pair
    h    -- hash function mapping a key to an index into ``htq``
    pair -- sequence whose first element is the key
    """
    key = pair[0]
    home = h(key)
    slot = home
    step = 1
    while True:
        entry = htq[slot]
        if entry == ():
            break
        if entry[0] == key or entry == vacant:
            break
        # Offsets grow as 1, 4, 9, ... measured from the home slot.
        slot = (home + step * step) % N
        step += 1
    htq[slot] = pair

insertq(htq, h, ("hello", "english"))

insertq(htq, h, ("ciao", "italian"))

insertq(htq, h, ("bonjour", "french"))

insertq(htq, h, ("bienvenidos", "spanish"))

insertq(htq, h, ("konnichiwa", "japanese"))

insertq(htq, h, ("hallo", "icelandic"))

insertq(htq, h, ("szia", "hungarian"))

insertq(htq, h, ("czesc", "polish"))

insertq(htq, h, ("ola", "portugese"))
print(htq)

[('czesc', 'polish'), ('hello', 'english'), ('bienvenidos', 'spanish'), ('konnichiwa', 'japanese'), (), ('ola', 'portugese'), ('szia', 'hungarian'), ('hallo', 'icelandic'), (), ('ciao', 'italian'), ('bonjour', 'french')]

The following insertion will cause an infinite loop, even though there are still empty buckets.

h("dobry dzien")

2

insertq(htq, h, ("dobry dzien", "belarusian"))
print(htq)

Load Factors¶

Let $n \mbox{ be the number of elements in a hash table of size } N$.

The load factor is $\lambda = \frac{n}{N}$. It must always be less than one.

Separate Chaining. We should have $\lambda < 0.9$.

The average number of probes for a successful search is $$\frac{\lambda + 1}{2}.$$

def cprobes(lam):
    """Return (lam + 1) / 2, the separate-chaining probe estimate for load factor ``lam``."""
    numerator = lam + 1
    return numerator / 2

cprobes(0.9)

0.95

Linear Probing. For linear probing, we want $\lambda < 0.5$.

The expected number of probes is $$ \frac{1 + \frac{1}{(1-\lambda)^2}}{2}$$ so $\lambda < 0.5$ gives fewer than 2.5 expected probes.

def lprobes(lam):
    """Return (1 + 1/(1 - lam)^2) / 2, the linear-probing probe estimate for load factor ``lam``."""
    gap = 1 - lam
    inverse_square = 1 / (gap * gap)
    return (1 + inverse_square) / 2

lprobes(0.9)

50.50000000000003

lprobes(0.5)

2.5

lprobes(.2)

1.28125

lprobes(.6)

3.6249999999999996

lprobes(.8)

13.000000000000007

lprobes(.95)

200.49999999999966

Quadratic Probing. $\lambda < 0.5$ is still appropriate. Provided the table size $N$ is prime, this guarantees we will not have the problem demonstrated above (failure to find an open bucket, even though there are some).