tag.ml 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. (*
  2. * _ _ ____ _
  3. * _| || |_/ ___| ___ _ __ _ __ ___ | |
  4. * |_ .. _\___ \ / _ \ '_ \| '_ \ / _ \| |
  5. * |_ _|___) | __/ |_) | |_) | (_) |_|
  6. * |_||_| |____/ \___| .__/| .__/ \___/(_)
  7. * |_| |_|
  8. *
  9. * Personal Social Web.
  10. *
  11. * Copyright (C) The #Seppo contributors. All rights reserved.
  12. *
  13. * This program is free software: you can redistribute it and/or modify
  14. * it under the terms of the GNU General Public License as published by
  15. * the Free Software Foundation, either version 3 of the License, or
  16. * (at your option) any later version.
  17. *
  18. * This program is distributed in the hope that it will be useful,
  19. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  21. * GNU General Public License for more details.
  22. *
  23. * You should have received a copy of the GNU General Public License
  24. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  25. *)
  26. (* https://www.unicode.org/reports/tr31/#D2 *)
  27. type tag = Tag of string
  28. type state = Ready | Continue | Medial | Ignore
  29. (*
  30. * https://www.unicode.org/reports/tr31/#D2
  31. *)
  32. let of_string s =
  33. let is_start uc =
  34. match Uchar.to_int uc with
  35. | 0x0023 (* '#' *)
  36. (* | 0xFE5F | 0xFF03 *) -> true
  37. | _ -> false
  38. in
  39. let is_continue uc =
  40. (Uucp.Id.is_xid_continue uc
  41. (* || Uucp.Emoji.is_extended_pictographic uc
  42. || Uucp.Emoji.is_emoji_component uc *)
  43. ||
  44. match Uchar.to_int uc with
  45. | 0x002B (* '+' *)
  46. | 0x002D (* '-' *)
  47. | 0x005F (* '_' *)
  48. | 0x200D (* zero width joiner *) ->
  49. true
  50. | _ -> false)
  51. && not (is_start uc)
  52. and is_medial uc =
  53. (* https://www.unicode.org/reports/tr31/#Table_Optional_Medial *)
  54. match Uchar.to_int uc with
  55. | 0x0040 (* '@' *) | 0x00A7 (* '§' *) | 0x2020 (* '†' *) -> true
  56. | _ -> false
  57. in
  58. let b = Buffer.create 42 in
  59. let flush_segment acc =
  60. let segment = Buffer.contents b in
  61. Buffer.clear b;
  62. if segment = "" || segment = "#"
  63. then acc
  64. else Tag segment :: acc
  65. and buffer u acc =
  66. Uutf.Buffer.add_utf_8 b u;
  67. acc
  68. in
  69. let each_uchar (st, acc) _ = function
  70. | `Malformed _ -> (st, acc)
  71. | `Uchar u -> (
  72. match st with
  73. | Ready ->
  74. if
  75. is_start u
  76. (* start with emoji even without prior # *)
  77. (* || Uucp.Emoji.is_extended_pictographic u *)
  78. then (Continue, buffer u acc)
  79. else if is_continue u then (Ignore, acc)
  80. else (Ready, acc)
  81. | Continue ->
  82. if is_continue u then (Continue, buffer u acc)
  83. else if is_medial u then (Medial, buffer u acc)
  84. else (Ready, flush_segment acc)
  85. | Medial ->
  86. if is_continue u then (Continue, buffer u acc)
  87. else (Ready, flush_segment acc)
  88. | Ignore -> if is_continue u then (Ignore, acc) else (Ready, acc))
  89. in
  90. let _, ret = Uutf.String.fold_utf_8 each_uchar (Ready, []) s in
  91. flush_segment ret |> List.rev
  92. (* https://codeberg.org/mro/ShaarliGo/src/branch/master/tags.go#L104 *)
  93. let fold (Tag s) =
  94. (* https://erratique.ch/software/uunf/doc/Uunf/index.html#utf8
  95. * https://erratique.ch/software/uutf/doc/Uutf/String/ *)
  96. let utf8_norm_filter pred nf s =
  97. let b = Buffer.create (String.length s * 3) in
  98. let n = Uunf.create nf in
  99. let rec add v =
  100. match Uunf.add n v with
  101. | `Uchar u ->
  102. if pred u then Uutf.Buffer.add_utf_8 b u;
  103. add `Await
  104. | `Await | `End -> ()
  105. in
  106. let add_uchar (_ : unit) (_ : int) = function
  107. | `Malformed _ -> add (`Uchar Uutf.u_rep)
  108. | `Uchar _ as u -> add u
  109. in
  110. Uutf.String.fold_utf_8 add_uchar () s;
  111. add `End;
  112. Buffer.contents b
  113. in
  114. s
  115. |> utf8_norm_filter (fun u -> `Mn != Uucp.Gc.general_category u) `NFD
  116. |> Uunf_string.normalize_utf_8 `NFC
  117. |> String.lowercase_ascii
  118. let diff cmp a_srt b_srt =
  119. let rec f a b (same, plus, minus) =
  120. match (a, b) with
  121. | [], _ -> (same, plus |> List.rev_append b, minus)
  122. | _, [] -> (same, plus, minus |> List.rev_append a)
  123. | ah :: at, bh :: bt ->
  124. let cm = cmp ah bh in
  125. if cm < 0 then f at b (same, plus, ah :: minus)
  126. else if cm > 0 then f a bt (same, bh :: plus, minus)
  127. else f at bt (ah :: same, plus, minus)
  128. in
  129. let r0, r1, r2 = f a_srt b_srt ([], [], []) in
  130. (r0 |> List.rev, r1 |> List.rev, r2 |> List.rev)
  131. (* LUT for folded keys -> label writing *)
  132. module Tmap = Map.Make (String)
  133. let add_tag v m =
  134. let k = fold v in
  135. match Tmap.find_opt k m with
  136. (* add only if not already there *)
  137. | None -> Tmap.add k v m
  138. | Some _ -> m
  139. let add_tag_list (v : tag list) m =
  140. let fkt m t = add_tag t m in
  141. List.fold_left fkt m v
  142. let add_tag_seq (v : tag Seq.t) m =
  143. let fkt m t = add_tag t m in
  144. Seq.fold_left fkt m v
  145. (* Find all tags in their existing spelling and append to the body if necessary.
  146. *
  147. * Data:
  148. * - title (line)
  149. * - body (multiline)
  150. * - tags list
  151. * - lookup evtl. existing Tag -> Tag with 'fold' equality or add it
  152. *
  153. * https://codeberg.org/mro/ShaarliGo/src/branch/master/tags.go#L124
  154. * https://discuss.ocaml.org/t/associative-stuff-ocaml-api/9870/3?u=mro
  155. *)
  156. let normalise0 short long tags lut f_add f_find : string * string * tag list =
  157. let txt = short |> of_string |> List.rev_append (long |> of_string) in
  158. let lut = lut |> f_add tags |> f_add txt in
  159. let luf v = lut |> f_find (fold v) in
  160. let tags = tags |> List.rev_map luf
  161. and txt = txt |> List.rev_map luf
  162. and tcmp (Tag a) (Tag b) = String.compare a b in
  163. let tsrt = List.sort_uniq tcmp in
  164. let _same, plus, minus = txt |> tsrt |> diff tcmp (tags |> tsrt) in
  165. let long =
  166. match minus with
  167. | [] -> long
  168. | ls ->
  169. long ^ "\n" ^ (ls |> List.map (fun (Tag t) -> t) |> String.concat " ")
  170. in
  171. (short, long, tags |> List.rev_append plus |> tsrt)
  172. let normalise short long tags (lut : tag Tmap.t) : string * string * tag list =
  173. normalise0 short long tags lut add_tag_list Tmap.find
  174. let slurp_channel ic =
  175. let chunk = 4 * 0x400 in
  176. let b = Buffer.create chunk in
  177. (try Buffer.add_channel b ic chunk with End_of_file -> ());
  178. b |> Buffer.to_bytes |> Bytes.to_string
  179. let sift_channel ic = Ok (ic |> slurp_channel |> of_string)
  180. let cdb = Mapcdb.Cdb "app/var/cache/tags.cdb"
  181. (** use a cdb as a backing map (store).
  182. *
  183. * Mapcdb cannot satisfy a Map.Make (String) yet
  184. *)
  185. let cdb_normalise short long tags (lut : Mapcdb.cdb) =
  186. let f_add (v : tag list) lut =
  187. let keep _ = true in
  188. let fkt_add_all (add1 : ((bytes*bytes) -> unit)) =
  189. let _added = v |> List.fold_left (fun lut' item ->
  190. let k = item |> fold |> Bytes.of_string in
  191. (match Mapcdb.find_opt k lut' with
  192. | None ->
  193. let (Tag v) = item in
  194. add1 (k,v |> Bytes.of_string);
  195. | Some _ -> ());
  196. lut'
  197. ) lut in
  198. ()
  199. in
  200. Mapcdb.add_many keep fkt_add_all lut
  201. in
  202. let f_find s lut =
  203. let s' = (Tag s) |> fold in
  204. Tag (match Mapcdb.find_string_opt s' lut with
  205. | None -> s
  206. | Some s -> s)
  207. in
  208. normalise0 short long tags lut f_add f_find