ds_cdb.ml 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. (*
  2. * Copyright (c) 2003 Dustin Sallings <dustin@spy.net>
  3. * Copyright (C) The #Seppo contributors. All rights reserved.
  4. *
  5. * based on
  6. * https://github.com/dustin/snippets/blob/master/ocaml/lib/cdb.ml
  7. *)
  8. (**
  9. * CDB Implementation. http://cr.yp.to/cdb/cdb.txt
  10. *)
  11. (* The cdb hash function is ``h = ((h << 5) + h) ^ c'', with a starting
  12. hash of 5381.
  13. *)
  14. type cdb_creator = {
  15. table_count : int array;
  16. (* Hash index pointers *)
  17. mutable pointers : (int32 * int32) list;
  18. out : out_channel;
  19. }
  20. let hash_init : int64 = 5381L
  21. let ff64 : int64 = 0xffL
  22. let ffffffff64 : int64 = 0xFFffFFffL
  23. let ff32 : int32 = Int32.of_int 0xff
  24. let hash (s : bytes) : int32 =
  25. let h = ref hash_init in
  26. Bytes.iter
  27. (fun c ->
  28. h :=
  29. Int64.logand ffffffff64
  30. (Int64.logxor
  31. (Int64.add (Int64.shift_left !h 5) !h)
  32. (Int64.of_int (int_of_char c))))
  33. s;
  34. Int64.to_int32 !h
  35. let wri4b_le oc (byt : int -> int) =
  36. let wri idx = byt idx |> output_byte oc in
  37. wri 0;
  38. wri 1;
  39. wri 2;
  40. wri 3
  41. let write_le cdc (i32 : int) =
  42. wri4b_le cdc.out (fun byt -> (i32 lsr (byt * 8)) land 0xff)
  43. let write_le32 cdc (i32 : int32) =
  44. wri4b_le cdc.out (fun byt ->
  45. Int32.to_int (Int32.logand (Int32.shift_right_logical i32 (byt * 8)) ff32))
  46. let cdb_creator_of_out_channel out_channel : cdb_creator =
  47. let cdb =
  48. { table_count = Array.make 256 0; pointers = []; out = out_channel }
  49. in
  50. (* Skip over the header *)
  51. seek_out cdb.out 2048;
  52. cdb
  53. let open_out (fn : string) : cdb_creator =
  54. fn |> open_out_bin |> cdb_creator_of_out_channel
  55. let hash_to_table h = Int32.to_int (Int32.logand h ff32)
  56. let hash_to_bucket h len =
  57. Int32.rem (Int32.shift_right_logical h 8) (Int32.of_int len) |> Int32.to_int
  58. let pos_out_32 x = x |> LargeFile.pos_out |> Int64.to_int32
  59. let add cdc k v =
  60. (* Add the hash to the list *)
  61. let h = hash k in
  62. cdc.pointers <- (h, pos_out_32 cdc.out) :: cdc.pointers;
  63. let table = hash_to_table h in
  64. cdc.table_count.(table) <- cdc.table_count.(table) + 1;
  65. (* Add the data to the file *)
  66. write_le cdc (Bytes.length k);
  67. write_le cdc (Bytes.length v);
  68. output_bytes cdc.out k;
  69. output_bytes cdc.out v
  70. (** Process a hash table *)
  71. let process_table cdc table_start slot_table slot_pointers i tc =
  72. (* Length of the table *)
  73. let len = tc * 2 in
  74. (* Store the table position *)
  75. slot_table := (pos_out_32 cdc.out, Int32.of_int len) :: !slot_table;
  76. (* Build the hash table *)
  77. let ht = Array.make len None in
  78. let cur_p = ref table_start.(i) in
  79. let lookupSlot n =
  80. try Hashtbl.find slot_pointers n with Not_found -> (Int32.zero, Int32.zero)
  81. in
  82. for _ = 0 to pred tc do
  83. let hp = lookupSlot !cur_p in
  84. cur_p := !cur_p + 1;
  85. (* Find an available hash bucket *)
  86. let rec find_slot where =
  87. match ht.(where) with
  88. | None -> where
  89. | Some _ -> if where + 1 = len then find_slot 0 else find_slot (where + 1)
  90. in
  91. let where = find_slot (hash_to_bucket (fst hp) len) in
  92. ht.(where) <- Some hp
  93. done;
  94. (* Write this hash table *)
  95. Array.iter
  96. (fun hpp ->
  97. let h, t =
  98. match hpp with None -> (Int32.zero, Int32.zero) | Some x -> x
  99. in
  100. write_le32 cdc h;
  101. write_le32 cdc t)
  102. ht
  103. let close_cdb_out cdc =
  104. let cur_entry = ref 0 in
  105. let table_start = Array.make 256 0 in
  106. (* Find all the hash starts *)
  107. Array.iteri
  108. (fun i x ->
  109. cur_entry := !cur_entry + x;
  110. table_start.(i) <- !cur_entry)
  111. cdc.table_count;
  112. (* Build out the slot pointers hash *)
  113. let slot_pointers = Hashtbl.create (List.length cdc.pointers) in
  114. (* Fill in the slot pointers *)
  115. List.iter
  116. (fun hp ->
  117. let h = fst hp in
  118. let table = hash_to_table h in
  119. table_start.(table) <- pred table_start.(table);
  120. Hashtbl.replace slot_pointers table_start.(table) hp)
  121. cdc.pointers;
  122. (* Write the shit out *)
  123. let slot_table = ref [] in
  124. (* Write out the hash tables *)
  125. Array.iteri
  126. (process_table cdc table_start slot_table slot_pointers)
  127. cdc.table_count;
  128. (* write out the pointer sets *)
  129. seek_out cdc.out 0;
  130. List.iter
  131. (fun x ->
  132. write_le32 cdc (fst x);
  133. write_le32 cdc (snd x))
  134. (List.rev !slot_table);
  135. close_out cdc.out
  136. (** {1 Iterating a cdb file} *)
  137. (* read a little-endian integer *)
  138. let read_le f =
  139. let a = input_byte f in
  140. let b = input_byte f in
  141. let c = input_byte f in
  142. let d = input_byte f in
  143. a lor (b lsl 8) lor (c lsl 16) lor (d lsl 24)
  144. (* Int32 version of read_le *)
  145. let read_le32 f =
  146. let a = input_byte f in
  147. let b = input_byte f in
  148. let c = input_byte f in
  149. let d = input_byte f in
  150. Int32.logor
  151. (Int32.of_int (a lor (b lsl 8) lor (c lsl 16)))
  152. (Int32.shift_left (Int32.of_int d) 24)
  153. let iter (f : bytes * bytes -> bool) (fn : string) : unit =
  154. let fin = open_in_bin fn in
  155. try
  156. (* Figure out where the end of all data is *)
  157. let eod = read_le32 fin in
  158. (* Seek to the record section *)
  159. seek_in fin 2048;
  160. let rec loop () =
  161. (* (pos_in fin) < eod *)
  162. if Int32.compare (Int64.to_int32 (LargeFile.pos_in fin)) eod < 0 then (
  163. let klen = read_le fin in
  164. let dlen = read_le fin in
  165. let key = Bytes.create klen in
  166. let data = Bytes.create dlen in
  167. really_input fin key 0 klen;
  168. really_input fin data 0 dlen;
  169. if f (key, data) then loop ())
  170. in
  171. loop ();
  172. close_in fin
  173. with x ->
  174. close_in fin;
  175. raise x
  176. type cdb_file = {
  177. f : in_channel;
  178. (* Position * length *)
  179. tables : (int32 * int) array;
  180. }
  181. let open_cdb_in (fn : string) : cdb_file =
  182. let fin = open_in_bin fn in
  183. let tables = Array.make 256 (Int32.zero, 0) in
  184. (* Set the positions and lengths *)
  185. Array.iteri
  186. (fun i _ ->
  187. let pos = read_le32 fin in
  188. let len = read_le fin in
  189. tables.(i) <- (pos, len))
  190. tables;
  191. { f = fin; tables }
  192. let close_cdb_in cdf = close_in cdf.f
  193. let find_all (cdf : cdb_file) (key : bytes) : bytes Stream.t =
  194. let kh = key |> hash in
  195. (* Find out where the hash table is *)
  196. let hpos, hlen = cdf.tables.(hash_to_table kh) and fd = cdf.f in
  197. let rec loop x =
  198. if x >= hlen then None
  199. else
  200. (* Calculate the slot containing these entries *)
  201. let lslot = (hash_to_bucket kh hlen + x) mod hlen in
  202. let spos = Int32.add (Int32.of_int (lslot * 8)) hpos in
  203. LargeFile.seek_in fd (Int64.of_int32 spos);
  204. let h = read_le32 fd in
  205. let pos = read_le32 fd in
  206. (* validate that we a real bucket *)
  207. if h = kh && Int32.compare pos Int32.zero > 0 then (
  208. LargeFile.seek_in fd (Int64.of_int32 pos);
  209. let klen = read_le fd in
  210. if klen = Bytes.length key then (
  211. let dlen = read_le fd in
  212. let rkey = Bytes.create klen in
  213. really_input fd rkey 0 klen;
  214. if rkey = key then (
  215. let rdata = Bytes.create dlen in
  216. really_input fd rdata 0 dlen;
  217. Some rdata)
  218. else loop (x + 1))
  219. else loop (x + 1))
  220. else loop (x + 1)
  221. in
  222. Stream.from loop
  223. let find_first cdf key =
  224. try Some (key |> find_all cdf |> Stream.next) with Stream.Failure -> None