WordList.cpp 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. /* WordList.cpp
  2. *
  3. * Copyright (C) 1999-2012,2015,2017 Paul Boersma
  4. *
  5. * This code is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This code is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. * See the GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this work. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. #include "WordList.h"
  19. #include "../kar/longchar.h"
  20. #include "oo_DESTROY.h"
  21. #include "WordList_def.h"
  22. #include "oo_COPY.h"
  23. #include "WordList_def.h"
  24. #include "oo_EQUAL.h"
  25. #include "WordList_def.h"
  26. #include "oo_CAN_WRITE_AS_ENCODING.h"
  27. #include "WordList_def.h"
  28. #include "oo_WRITE_TEXT.h"
  29. #include "WordList_def.h"
  30. #include "oo_READ_TEXT.h"
  31. #include "WordList_def.h"
  32. #include "oo_DESCRIPTION.h"
  33. #include "WordList_def.h"
  34. /* BUG: not Unicode-savvy
   * What this file shows about that limitation:
   * - Strings_to_WordList refuses any string containing a character above 126;
   * - the binary writer emits one byte per character, and the binary reader
   *   treats every byte of 128 or above as the start marker of the next word.
   * NOTE(review): Thing_implement presumably registers WordList as a class derived
   * from Daata with version number 0 -- confirm against the macro's definition.
   */
  35. Thing_implement (WordList, Daata, 0);
  36. static integer WordList_count (WordList me) {
  37. integer n = 0;
  38. for (char32 *p = & my string [0]; *p; p ++) {
  39. if (*p == '\n') n += 1;
  40. }
  41. return n;
  42. }
  43. void structWordList :: v_info () {
  44. structDaata :: v_info ();
  45. integer n = WordList_count (this);
  46. if (our length == 0)
  47. our length = str32len (our string.get());
  48. MelderInfo_writeLine (U"Number of words: ", n);
  49. MelderInfo_writeLine (U"Number of characters: ", length - n);
  50. }
  51. void structWordList :: v_readBinary (FILE *f, int /*formatVersion*/) {
  52. char32 *current, *p;
  53. int kar = 0;
  54. our length = bingeti32 (f);
  55. if (our length < 0)
  56. Melder_throw (U"Wrong length ", our length, U".");
  57. our string = autostring32 (our length);
  58. p = current = & our string [0];
  59. if (our length > 0) {
  60. /*
  61. * Read first word.
  62. */
  63. for (;;) {
  64. if (p - & string [0] >= length - 1) break;
  65. kar = fgetc (f);
  66. if (kar == EOF)
  67. Melder_throw (U"Early end of file.");
  68. if (kar >= 128) break;
  69. *p ++ = kar;
  70. }
  71. *p ++ = U'\n';
  72. /*
  73. * Read following words.
  74. */
  75. for (;;) {
  76. char32 *previous = current;
  77. int numberOfSame = kar - 128;
  78. current = p;
  79. str32ncpy (current, previous, numberOfSame);
  80. p += numberOfSame;
  81. for (;;) {
  82. if (p - & string [0] >= length - 1) break;
  83. kar = fgetc (f);
  84. if (kar == EOF)
  85. Melder_throw (U"Early end of file.");
  86. if (kar >= 128) break;
  87. *p ++ = kar;
  88. }
  89. *p ++ = U'\n';
  90. if (p - & string [0] >= our length) break;
  91. }
  92. }
  93. *p = U'\0';
  94. if (p - & our string [0] != our length)
  95. Melder_throw (U"Length in header (", our length, U") does not match lenth of string (", (integer) (p - & our string [0]), U").");
  96. }
  97. void structWordList :: v_writeBinary (FILE *f) {
  98. integer currentLength, previousLength;
  99. if (our length == 0)
  100. our length = str32len (our string.get());
  101. binputi32 (our length, f);
  102. if (our length > 0) {
  103. char32 *current = & our string [0], *kar = current;
  104. for (kar = current; *kar != U'\n'; kar ++) { }
  105. currentLength = kar - current;
  106. for (integer i = 0; i < currentLength; i ++)
  107. fputc ((int) current [i], f); // TODO: check
  108. for (;;) {
  109. char32 *previous = current, *kar1, *kar2;
  110. int numberOfSame;
  111. previousLength = currentLength;
  112. current = previous + previousLength + 1;
  113. if (*current == U'\0') break;
  114. kar1 = previous, kar2 = current;
  115. while (*kar2 != U'\n' && *kar2 == *kar1) {
  116. kar1 ++, kar2 ++;
  117. }
  118. numberOfSame = kar2 - current;
  119. if (numberOfSame > 127) numberOfSame = 127; // clip
  120. fputc (128 + numberOfSame, f);
  121. while (*kar2 != U'\n') kar2 ++;
  122. currentLength = kar2 - current;
  123. for (integer i = 0; i < currentLength - numberOfSame; i ++)
  124. fputc ((int) current [numberOfSame + i], f); // TODO: check
  125. }
  126. }
  127. }
  128. autoWordList Strings_to_WordList (Strings me) {
  129. try {
  130. integer totalLength = 0;
  131. /*
  132. * Check whether the strings are generic and sorted.
  133. */
  134. for (integer i = 1; i <= my numberOfStrings; i ++) {
  135. char32 *string = my strings [i].get(), *p;
  136. for (p = & string [0]; *p; p ++) {
  137. if (*p > 126)
  138. Melder_throw (U"String \"", string, U"\" not generic.\nPlease convert to backslash trigraphs first.");
  139. }
  140. if (i > 1 && str32cmp (my strings [i - 1].get(), string) > 0) {
  141. Melder_throw (U"String \"", string, U"\" not sorted.\nPlease sort first.");
  142. }
  143. totalLength += str32len (string);
  144. }
  145. autoWordList thee = Thing_new (WordList);
  146. thy length = totalLength + my numberOfStrings;
  147. thy string = autostring32 (thy length);
  148. /*
  149. * Concatenate the strings into the word list.
  150. */
  151. char32 *q = & thy string [0];
  152. for (integer i = 1; i <= my numberOfStrings; i ++) {
  153. integer length = str32len (my strings [i].get());
  154. str32cpy (q, my strings [i].get());
  155. q += length;
  156. *q ++ = '\n';
  157. }
  158. *q = U'\0';
  159. Melder_assert (q - & thy string [0] == thy length);
  160. return thee;
  161. } catch (MelderError) {
  162. Melder_throw (me, U": not converted to WordList.");
  163. }
  164. }
  165. autoStrings WordList_to_Strings (WordList me) {
  166. try {
  167. unsigned char *word = (unsigned char *) & my string [0]; // BUG: explain this
  168. autoStrings thee = Thing_new (Strings);
  169. thy numberOfStrings = WordList_count (me);
  170. if (thy numberOfStrings > 0) {
  171. thy strings = autostring32vector (thy numberOfStrings);
  172. }
  173. for (integer i = 1; i <= thy numberOfStrings; i ++) {
  174. unsigned char *kar = word;
  175. for (; *kar != '\n'; kar ++) { }
  176. integer length = kar - word;
  177. thy strings [i] = autostring32 (length);
  178. str32ncpy (thy strings [i].get(), Melder_peek8to32 ((const char *) word), length);
  179. thy strings [i] [length] = U'\0';
  180. word += length + 1;
  181. }
  182. return thee;
  183. } catch (MelderError) {
  184. Melder_throw (me, U": not converted to Strings.");
  185. }
  186. }
  187. static integer gotoStart (WordList me, integer p) {
  188. if (p <= 0) return 0;
  189. -- p;
  190. while (p >= 0 && my string [p] != U'\n') p --;
  191. return p + 1;
  192. }
  193. static integer gotoNext (WordList me, integer p) {
  194. if (p >= my length - 1) return my length;
  195. while (my string [p] != U'\n') p ++;
  196. return p + 1;
  197. }
  198. static integer gotoPrevious (WordList me, integer p) {
  199. if (p <= 0) return -1;
  200. if (my string [-- p] != U'\n') return -1; // should not occur
  201. if (p <= 0) return 0; // if first word is empty
  202. -- p; // step from newline
  203. while (p >= 0 && my string [p] != U'\n') p --;
  204. return p + 1;
  205. }
  206. static int compare (conststring32 word, conststring32 p) {
  207. for (;;) {
  208. if (*word == U'\0') {
  209. if (*p == U'\n') return 0;
  210. else return -1; // word is substring of p
  211. }
  212. if (*p == U'\n') return +1; // p is substring of word
  213. if (*word < *p) return -1;
  214. if (*word > *p) return +1;
  215. word ++, p ++;
  216. }
  217. return 0; // should not occur
  218. }
  219. static char32 buffer [3333+1];
  220. bool WordList_hasWord (WordList me, conststring32 word) {
  221. if (str32len (word) > 3333)
  222. return false;
  223. Longchar_genericize32 (word, buffer);
  224. if (! my length)
  225. my length = str32len (my string.get());
  226. integer p = my length / 2, d = p / 2;
  227. while (d > 20) {
  228. p = gotoStart (me, p);
  229. int cf = compare (buffer, my string.get() + p);
  230. if (cf == 0) return true;
  231. if (cf < 0) p -= d; else p += d;
  232. d /= 2;
  233. }
  234. p = gotoStart (me, p);
  235. int cf = compare (buffer, my string.get() + p);
  236. if (cf == 0) return true;
  237. if (cf > 0) {
  238. for (;;) {
  239. p = gotoNext (me, p);
  240. if (p >= my length) return false;
  241. cf = compare (buffer, my string.get() + p);
  242. if (cf == 0) return true;
  243. if (cf < 0) return false;
  244. }
  245. } else {
  246. for (;;) {
  247. p = gotoPrevious (me, p);
  248. if (p < 0) return false;
  249. cf = compare (buffer, my string.get() + p);
  250. if (cf == 0) return true;
  251. if (cf > 0) return false;
  252. }
  253. }
  254. return false; // should not occur
  255. }
  256. /* End of file WordList.cpp */