aut-csc
/
hash-tables


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
							% Copyright (C) 2017 Koz Ross <koz.ross@retro-freedom.nz>

% This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 
% International License. To view a copy of this license, visit 
% http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to Creative 
% Commons, PO Box 1866, Mountain View, CA 94042, USA.

\documentclass{beamer}

\usepackage[utf8]{inputenc}
\usecolortheme{seahorse}
\usefonttheme{professionalfonts}
\usepackage{fontspec}
\setmainfont{DejaVu Sans}
\setbeamertemplate{navigation symbols}{}
\usepackage{caption}
\usepackage[font=itshape, begintext=``, endtext='']{quoting}
\usepackage{fancyvrb}
\usepackage{color}
\usepackage{booktabs}
\usepackage[noend]{algpseudocode}

\title{Hash tables}
\subtitle{Or: why maths {\em matters\/}}
\titlegraphic{\includegraphics[scale=0.8]{img/logo.pdf}}
\author{Yuan Yuan}
\date{20th July, 2017}

\begin{document}

\begin{frame}[c]
  \titlepage{}
\end{frame}

\begin{frame}[c]
  \frametitle{Outline}
  \tableofcontents
\end{frame}

\section{The dictionary problem}

\begin{frame}[c]
  \frametitle{The dictionary problem}

  \begin{definition}
    An {\em entry\/} is a pair of {\em key\/} and {\em value}.  
  \end{definition}\pause{}

  \begin{definition}
    A {\em dictionary\/} is a set of entries, such that no two entries have the
    same key.
  \end{definition}\pause{}

  \begin{definition}
    The {\em dictionary problem\/} requires us to maintain a dictionary $D$, 
    with the following operations:\pause{}

    \begin{description}[$\mathrm{put}(D,k,v)$:]
      \item[$\mathrm{len}(D)$:] Return the number of entries in $D$\pause{} 
      \item[$\mathrm{put}(D, k, v)$:] Add a new entry to $D$ with key $k$ and
        value $v$; if an entry with key $k$ already exists, replace its value
        with $v$\pause{}
      \item[$\mathrm{get}(D, k)$:] Return the value of the entry whose key is
        $k$, or $\mathrm{null}$ if no such entry exists
    \end{description}
  \end{definition}
\end{frame}

\begin{frame}[c]
  \frametitle{Why we care about the dictionary problem}
  \pause{}
  \begin{itemize}
    \item Databases (keys are identifiers, values are data)\pause{}
    \item A solution to this problem can implement almost any other data
      structure:\pause{}
      \begin{itemize}
        \item Arrays and lists (keys are consecutive integers)\pause{}
        \item Sets (keys and values are the same as each other)\pause{}
        \item Trees, graphs, etc\pause{}
        \item At least one programming language (Lua) does exactly this!\pause{}
      \end{itemize}
    \item Allows us to give {\em names\/} to data, and use those names instead
      of the data itself (c.f.\ any programming language ever)\pause{}
  \end{itemize}
  
  \medskip{}

  In short, {\em we want good solutions to the dictionary problem!\/}\pause{} We
  also don't want to impose too many constraints on the data beyond equality
  being defined (so ordering shouldn't matter, for example).
\end{frame}

\begin{frame}[c]
  \frametitle{First attempt: Array or list}
  
  We can store our entries in an array or list.\pause{} We can remember the
  number of elements we store, and modify it on all operations in constant time,
  which makes $\mathrm{len}$ a $O(1)$ operation.\pause{}

  \medskip

  For $\mathrm{put}$, we scan the array or list left-to-right, comparing each
  entry's key with $k$. If we get a match, replace that entry's value with $v$;
  otherwise, add a new entry with key $k$ and value $v$ at the end.\pause{} This
  is $O(n)$, because we might have to scan the entire array or list.\pause{}

  \medskip

  We can do $\mathrm{get}$ similarly, except that we return the value if we find
  a match, or $\mathrm{null}$ otherwise.\pause{} This is also $O(n)$.\pause{}

  \bigskip

  \alert{Verdict:} Not very good at all.
\end{frame}

\begin{frame}[c]
  \frametitle{Can we do better?}
  
  Ideally, we want the following:\pause{}

  \begin{itemize}
    \item $\mathrm{get}$ and $\mathrm{put}$ to be {\em asymptotically\/} fast
      (as close to $O(1)$ as possible)\pause{}
    \item All operations to be {\em practically\/} fast (i.e.\ no asymptotic
      `abuse')\pause{}
    \item A data structure which is as simple as possible (because programming
      is hard enough already)\pause{}
  \end{itemize}

  \medskip

  There {\em is\/} a structure which achieves all of the above.\pause{} First,
  we need to do a bit of preparation\ldots
\end{frame}

\section{Hash functions}

\begin{frame}[c]
  \frametitle{Preliminaries}

  Let $\mathbb{N} = \{0, 1, 2, \ldots\}$ be the set of {\em natural
    numbers}. We use $\mathbb{N}_{k}$ to represent the set $\{x \in \mathbb{N}
  \mid x \text{ can be represented using } k \text{ bits }\}$, for $k \in
  \mathbb{N}$.\pause{} For example, $\mathbb{N}_{0} = \{\}$ (no number can be
  represented in $0$ bits) and $\mathbb{N}_{1} = \{0, 1\}$.\pause{}

  \medskip

  Let $A,B$ be sets. A {\em function\/} $f: A \rightarrow B$ is a set of pairs
  such that for any $(x,y) \in f$, $x \in A, y \in B$, and for every $x \in 
  A$, there exists exactly one $y \in B$ such that $(x,y) \in f$.\pause{} In
  this case, $A$ is the {\em domain\/} of $f$ and $B$ is the {\em codomain\/} of
  $f$.\pause{} For any $x \in A$, we denote by $f(x)$ (the {\em value of $f$
  at $x$\/}) the $y$ such that $(x,y) \in f$.\pause{}

  \medskip

  An example function $f: \{a, b, c\} \rightarrow \{1, 2, 3\}$ is $\{(a,
    1), (b, 1), (c, 2)\}$.\pause{} In this case, the domain of $f$ is $\{a, b,
  c\}$, the codomain of $f$ is $\{1, 2, 3\}$ and $f(a) = 1$.
\end{frame}

\begin{frame}[c]
  \frametitle{More about functions}

  Let $f: A \rightarrow B$ be a function. We say $f$ is {\em one-to-one\/} if,
  for any $x, y \in A$, if $x \neq y$, then $f(x) \neq f(y)$.\pause{} An example 
  one-to-one function $f: \{a, b, c\} \rightarrow \{1, 2, 3\}$ would be 
  $\{(a, 1), (b, 2), (c, 3)\}$.\pause{}

  \medskip

  \begin{lemma}
    Let $A$ be an infinite set and $B$ be a finite set. There is no function $f:
    A \rightarrow B$ such that $f$ is one-to-one.
  \end{lemma}\pause{}

  \medskip

  \begin{definition}
    Let $A$ be a set, and let $k \in \mathbb{N}$. A {\em hash function for $A$\/} 
    is some $f: A \rightarrow \mathbb{N}_{k}$. We call the value of $f(x)$ the
    {\em hash\/} of $x$.
  \end{definition}\pause{}
  
  \medskip

  You can think of a hash function as producing a {\em fixed-length summary\/}
  of its input.
\end{frame}

\section{The hash table}

\begin{frame}[c]
  \frametitle{The hash table}
  
  Let $K$ be a set of keys, $V$ be a set of values, and $k \in \mathbb{N}$.\pause{} 
  A {\em hash table\/} $H$ for $K,V$ consists of:\pause{}

  \begin{itemize}
    \item A hash function $H.\mathrm{hash}: K \rightarrow \mathbb{N}_{k}$\pause{}
    \item An array $H.\mathrm{buckets}$ of {\em buckets}, capable of storing
      elements of $V$. Additionally, $\mathrm{len}(H.\mathrm{buckets}) \leq 
      2^{k}$, initially full of $\mathrm{null}$s.\pause{}
  \end{itemize}

  \medskip

  As $H.\mathrm{buckets}$ is an array, we store and update its length similarly
  to our original solution, giving us $\mathrm{len}$ in $O(1)$ time.\pause{}
\end{frame}

\begin{frame}[c, fragile]
  \frametitle{$\mathrm{put}$ for a hash table}
  
  As $\mathrm{hash}$ produces a number, we can convert the hash of any key $x$ into
  a valid index for $\mathrm{buckets}$ by taking $\mathrm{hash}(x)$ modulo
  $\mathrm{len}(\mathrm{buckets})$.\pause{} If there is nothing at that index,
  we simply store our value there; otherwise, we replace the value there with
  the one we are given.\pause{}
  
  \medskip

  \begin{algorithmic}
    \Function{$\mathrm{put}$}{$H, k, v$}
      \State{}$i \gets{}$ \Call{$H.\mathrm{hash}$}{$k$} $\mathrm{\%}$
      \Call{$\mathrm{len}$}{$H.\mathrm{buckets}$}
      \If{$H.\mathrm{buckets}[i] = \mathrm{null}$}
        \State{}\Call{$\mathrm{len}$}{$H$} $\gets$ \Call{$\mathrm{len}$}{$H$} $+
        1$
      \EndIf{}
      \State{}$H.\mathrm{buckets}[i] \gets{} v$
    \EndFunction{}
  \end{algorithmic}\pause{}

  \medskip

  This only requires a constant amount of time, plus however long it takes to
  call $\mathrm{hash}$.

\end{frame}

\begin{frame}[c, fragile]
  \frametitle{$\mathrm{get}$ for a hash table}

  This is very similar to $\mathrm{put}$.\pause{}

  \medskip

  \begin{algorithmic}
    \Function{$\mathrm{get}$}{$H, k$}
    \State{}$i \gets{}$ \Call{$H.\mathrm{hash}$}{$k$} $\mathrm{\%}$
      \Call{$\mathrm{len}$}{$H.\mathrm{buckets}$}
      \State{}\Return{}$H.\mathrm{buckets}[i]$
    \EndFunction{}
  \end{algorithmic}\pause{}

  \medskip

  This also requires a constant amount of time, plus the call time of
  $\mathrm{hash}$.\pause{}

  \medskip

  This looks great!\pause{} However, this is too simplistic, and won't work in
  practice.
\end{frame}

\begin{frame}[c]
  \frametitle{Problems with our hash table}

  Our design assumes two things:\pause{}
  
  \begin{itemize}
    \item We will never try to $\mathrm{put}$ more entries into $H$
      than $\mathrm{len}(H.\mathrm{buckets})$\pause{}
    \item Given two different keys, $H.\mathrm{hash}$ will produce two different
      hashes\pause{}
  \end{itemize}

  \medskip

  Both of these are false in general:\pause{} the former obviously so, the latter
  because of our prior lemma, and the fact that most interesting sets of keys
  (e.g.\ all strings) are infinite.\pause{} Thus, what will inevitably happen at
  some point is that our $\mathrm{put}$ procedure will assign the same index to
  two different keys.\pause{} This is called a {\em collision}, and it really
  ruins our day (and design).\pause{}

  \medskip

  Collisions are inevitable --- based on this, we have to design with them in
  mind.\pause{} Luckily, our design is easy to fix to take collisions into
  account.
\end{frame}

\begin{frame}[c]
  \frametitle{Hash chaining}

  Instead of $\mathrm{buckets}$ storing values, each index stores a linked list
  of entries, which start out empty (a {\em bucket list\/}).\pause{} After we calculate an index, we
  work with the list at that position (just like with our first attempt), for both
  of our operations.\pause{}

  \medskip

  As long as our bucket lists fill up roughly evenly, the time required for
  $\mathrm{get}$ and $\mathrm{put}$ will be roughly
  $O(\frac{n}{m})$, where $m$ is the size of our bucket array.\pause{} This is 
  pretty good in practice, as long as we don't try to crowd too many entries into 
  our hash table.\pause{} 
  
  \medskip

  How can we be sure that our bucket lists will fill evenly?\pause{} A function
  which hashes {\em everything\/} to $1$ is a hash function, and
  most certainly {\em won't\/} cause our bucket lists to fill evenly!
\end{frame}

\begin{frame}[c]
  \frametitle{{\em Good\/} hash functions}
  \pause{}

  In order to make sure our buckets fill evenly, we need more from our hash
  functions.\pause{} Specifically, for a hash function $f: A \rightarrow
  \mathbb{N}_{k}$, we would like $f$ to have one (or both!) of the following 
  properties:\pause{}

  \medskip

  \begin{definition}
    $f$ is {\em uniform\/} if, given a random $x \in A$ and any specific $y \in
    \mathbb{N}_{k}$, there is a $\frac{1}{2^k}$ probability that $f(x) = y$.
  \end{definition}\pause{}

  \begin{definition}
    $f$ {\em exhibits the avalanche effect\/} if, for any $x,y \in A$ which differ
    by $1$ bit, $f(x), f(y)$ will differ by $\frac{k}{2}$ bits. 
  \end{definition}\pause{}

  \medskip

  These guarantee that, when the number of entries gets large, there is a high
  probability that they will be distributed evenly over all bucket lists.
\end{frame}

\begin{frame}[c]
  \frametitle{Limitations of hash tables}

  \pause{}
  \begin{itemize}
    \item For small hash tables, the cost of hashing overwhelms everything else
      (making them slower than array or list-based approaches)\pause{}
    \item Finding a good hash function can be difficult, especially for
      variable-length keys (but easier now with
      cryptographic hashing and universal hash functions)\pause{}
    \item No order for entries (or rather, no {\em specific\/} order)\pause{}
    \item Cache-unfriendly due to bucket lists (but alternative approaches exist
      which don't require them)\pause{}
  \end{itemize}

  \medskip

  Overall, hash tables work best for dictionaries with large numbers of entries
  and simple fixed-length keys. This is not necessarily a problem, as a lot of
  real data fits these requirements.\pause{} However, as always, {\em know your
  data and your tradeoffs\/}!


\end{frame}

\section{Questions}

\begin{frame}[fragile, c]
  \frametitle{Questions?}
  \begin{center}
  \includegraphics[scale=0.27]{img/questions.pdf}
\end{center}
\end{frame}

\end{document}