123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251 |
- (*
- * Copyright (c) 2003 Dustin Sallings <dustin@spy.net>
- * Copyright (C) The #Seppo contributors. All rights reserved.
- *
- * based on
- * https://github.com/dustin/snippets/blob/master/ocaml/lib/cdb.ml
- *)
- (**
- * CDB Implementation. http://cr.yp.to/cdb/cdb.txt
- *)
- (* The cdb hash function is ``h = ((h << 5) + h) ^ c'', with a starting
- hash of 5381.
- *)
- type cdb_creator = {
- table_count : int array;
- (* Hash index pointers *)
- mutable pointers : (int32 * int32) list;
- out : out_channel;
- }
- let hash_init : int64 = 5381L
- let ff64 : int64 = 0xffL
- let ffffffff64 : int64 = 0xFFffFFffL
- let ff32 : int32 = Int32.of_int 0xff
- let hash (s : bytes) : int32 =
- let h = ref hash_init in
- Bytes.iter
- (fun c ->
- h :=
- Int64.logand ffffffff64
- (Int64.logxor
- (Int64.add (Int64.shift_left !h 5) !h)
- (Int64.of_int (int_of_char c))))
- s;
- Int64.to_int32 !h
- let wri4b_le oc (byt : int -> int) =
- let wri idx = byt idx |> output_byte oc in
- wri 0;
- wri 1;
- wri 2;
- wri 3
- let write_le cdc (i32 : int) =
- wri4b_le cdc.out (fun byt -> (i32 lsr (byt * 8)) land 0xff)
- let write_le32 cdc (i32 : int32) =
- wri4b_le cdc.out (fun byt ->
- Int32.to_int (Int32.logand (Int32.shift_right_logical i32 (byt * 8)) ff32))
- let cdb_creator_of_out_channel out_channel : cdb_creator =
- let cdb =
- { table_count = Array.make 256 0; pointers = []; out = out_channel }
- in
- (* Skip over the header *)
- seek_out cdb.out 2048;
- cdb
- let open_out (fn : string) : cdb_creator =
- fn |> open_out_bin |> cdb_creator_of_out_channel
- let hash_to_table h = Int32.to_int (Int32.logand h ff32)
- let hash_to_bucket h len =
- Int32.rem (Int32.shift_right_logical h 8) (Int32.of_int len) |> Int32.to_int
- let pos_out_32 x = x |> LargeFile.pos_out |> Int64.to_int32
- let add cdc k v =
- (* Add the hash to the list *)
- let h = hash k in
- cdc.pointers <- (h, pos_out_32 cdc.out) :: cdc.pointers;
- let table = hash_to_table h in
- cdc.table_count.(table) <- cdc.table_count.(table) + 1;
- (* Add the data to the file *)
- write_le cdc (Bytes.length k);
- write_le cdc (Bytes.length v);
- output_bytes cdc.out k;
- output_bytes cdc.out v
- (** Process a hash table *)
- let process_table cdc table_start slot_table slot_pointers i tc =
- (* Length of the table *)
- let len = tc * 2 in
- (* Store the table position *)
- slot_table := (pos_out_32 cdc.out, Int32.of_int len) :: !slot_table;
- (* Build the hash table *)
- let ht = Array.make len None in
- let cur_p = ref table_start.(i) in
- let lookupSlot n =
- try Hashtbl.find slot_pointers n with Not_found -> (Int32.zero, Int32.zero)
- in
- for _ = 0 to tc - 1 do
- let hp = lookupSlot !cur_p in
- cur_p := !cur_p + 1;
- (* Find an available hash bucket *)
- let rec find_slot where =
- match ht.(where) with
- | None -> where
- | Some _ -> if where + 1 = len then find_slot 0 else find_slot (where + 1)
- in
- let where = find_slot (hash_to_bucket (fst hp) len) in
- ht.(where) <- Some hp
- done;
- (* Write this hash table *)
- Array.iter
- (fun hpp ->
- let h, t =
- match hpp with None -> (Int32.zero, Int32.zero) | Some x -> x
- in
- write_le32 cdc h;
- write_le32 cdc t)
- ht
- let close_cdb_out cdc =
- let cur_entry = ref 0 in
- let table_start = Array.make 256 0 in
- (* Find all the hash starts *)
- Array.iteri
- (fun i x ->
- cur_entry := !cur_entry + x;
- table_start.(i) <- !cur_entry)
- cdc.table_count;
- (* Build out the slot pointers hash *)
- let slot_pointers = Hashtbl.create (List.length cdc.pointers) in
- (* Fill in the slot pointers *)
- List.iter
- (fun hp ->
- let h = fst hp in
- let table = hash_to_table h in
- table_start.(table) <- table_start.(table) - 1;
- Hashtbl.replace slot_pointers table_start.(table) hp)
- cdc.pointers;
- (* Write the shit out *)
- let slot_table = ref [] in
- (* Write out the hash tables *)
- Array.iteri
- (process_table cdc table_start slot_table slot_pointers)
- cdc.table_count;
- (* write out the pointer sets *)
- seek_out cdc.out 0;
- List.iter
- (fun x ->
- write_le32 cdc (fst x);
- write_le32 cdc (snd x))
- (List.rev !slot_table);
- close_out cdc.out
- (** {1 Iterating a cdb file} *)
- (* read a little-endian integer *)
- let read_le f =
- let a = input_byte f in
- let b = input_byte f in
- let c = input_byte f in
- let d = input_byte f in
- a lor (b lsl 8) lor (c lsl 16) lor (d lsl 24)
- (* Int32 version of read_le *)
- let read_le32 f =
- let a = input_byte f in
- let b = input_byte f in
- let c = input_byte f in
- let d = input_byte f in
- Int32.logor
- (Int32.of_int (a lor (b lsl 8) lor (c lsl 16)))
- (Int32.shift_left (Int32.of_int d) 24)
- let iter (f : bytes * bytes -> bool) (fn : string) : unit =
- let fin = open_in_bin fn in
- try
- (* Figure out where the end of all data is *)
- let eod = read_le32 fin in
- (* Seek to the record section *)
- seek_in fin 2048;
- let rec loop () =
- (* (pos_in fin) < eod *)
- if Int32.compare (Int64.to_int32 (LargeFile.pos_in fin)) eod < 0 then (
- let klen = read_le fin in
- let dlen = read_le fin in
- let key = Bytes.create klen in
- let data = Bytes.create dlen in
- really_input fin key 0 klen;
- really_input fin data 0 dlen;
- if f (key, data) then loop ())
- in
- loop ();
- close_in fin
- with x ->
- close_in fin;
- raise x
- type cdb_file = {
- f : in_channel;
- (* Position * length *)
- tables : (int32 * int) array;
- }
- let open_cdb_in (fn : string) : cdb_file =
- let fin = open_in_bin fn in
- let tables = Array.make 256 (Int32.zero, 0) in
- (* Set the positions and lengths *)
- Array.iteri
- (fun i _ ->
- let pos = read_le32 fin in
- let len = read_le fin in
- tables.(i) <- (pos, len))
- tables;
- { f = fin; tables }
- let close_cdb_in cdf = close_in cdf.f
- let find_all (cdf : cdb_file) (key : bytes) : bytes Stream.t =
- let kh = key |> hash in
- (* Find out where the hash table is *)
- let hpos, hlen = cdf.tables.(hash_to_table kh) and fd = cdf.f in
- let rec loop x =
- if x >= hlen then None
- else
- (* Calculate the slot containing these entries *)
- let lslot = (hash_to_bucket kh hlen + x) mod hlen in
- let spos = Int32.add (Int32.of_int (lslot * 8)) hpos in
- LargeFile.seek_in fd (Int64.of_int32 spos);
- let h = read_le32 fd in
- let pos = read_le32 fd in
- (* validate that we a real bucket *)
- if h = kh && Int32.compare pos Int32.zero > 0 then (
- LargeFile.seek_in fd (Int64.of_int32 pos);
- let klen = read_le fd in
- if klen = Bytes.length key then (
- let dlen = read_le fd in
- let rkey = Bytes.create klen in
- really_input fd rkey 0 klen;
- if rkey = key then (
- let rdata = Bytes.create dlen in
- really_input fd rdata 0 dlen;
- Some rdata)
- else loop (x + 1))
- else loop (x + 1))
- else loop (x + 1)
- in
- Stream.from loop
- let find_first cdf key =
- try Some (key |> find_all cdf |> Stream.next) with Stream.Failure -> None
|