SpellingChecker.cpp 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /* SpellingChecker.cpp
  2. *
  3. * Copyright (C) 1999-2007,2011,2012,2015-2018 Paul Boersma
  4. *
  5. * This code is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This code is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. * See the GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this work. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. #include "SpellingChecker.h"
  19. #include <ctype.h>
  20. #include "oo_DESTROY.h"
  21. #include "SpellingChecker_def.h"
  22. #include "oo_COPY.h"
  23. #include "SpellingChecker_def.h"
  24. #include "oo_EQUAL.h"
  25. #include "SpellingChecker_def.h"
  26. #include "oo_CAN_WRITE_AS_ENCODING.h"
  27. #include "SpellingChecker_def.h"
  28. #include "oo_WRITE_TEXT.h"
  29. #include "SpellingChecker_def.h"
  30. #include "oo_WRITE_BINARY.h"
  31. #include "SpellingChecker_def.h"
  32. #include "oo_READ_TEXT.h"
  33. #include "SpellingChecker_def.h"
  34. #include "oo_READ_BINARY.h"
  35. #include "SpellingChecker_def.h"
  36. #include "oo_DESCRIPTION.h"
  37. #include "SpellingChecker_def.h"
  38. #include "../kar/longchar.h"
  39. Thing_implement (SpellingChecker, Daata, 0);
  40. autoSpellingChecker WordList_upto_SpellingChecker (WordList me) {
  41. try {
  42. autoSpellingChecker thee = Thing_new (SpellingChecker);
  43. thy wordList = Data_copy (me);
  44. thy userDictionary = StringSet_create ();
  45. thy separatingCharacters = Melder_dup (U".,;:()\"");
  46. return thee;
  47. } catch (MelderError) {
  48. Melder_throw (me, U": not converted to SpellingChecker.");
  49. }
  50. }
  51. autoWordList SpellingChecker_extractWordList (SpellingChecker me) {
  52. return Data_copy (my wordList.get());
  53. }
  54. void SpellingChecker_replaceWordList (SpellingChecker me, WordList list) {
  55. try {
  56. my wordList = Data_copy (list);
  57. } catch (MelderError) {
  58. Melder_throw (me, U": word list not replaced.");
  59. }
  60. }
  61. autoStringSet SpellingChecker_extractUserDictionary (SpellingChecker me) {
  62. try {
  63. if (my userDictionary->size == 0)
  64. Melder_throw (U"This spelling checker does not contain a user dictionary.");
  65. return Data_copy (my userDictionary.get());
  66. } catch (MelderError) {
  67. Melder_throw (me, U": user dictionary not extracted.");
  68. }
  69. }
  70. void SpellingChecker_replaceUserDictionary (SpellingChecker me, StringSet userDictionary) {
  71. try {
  72. my userDictionary = Data_copy (userDictionary);
  73. } catch (MelderError) {
  74. Melder_throw (me, U": user dictionary not replaced.");
  75. }
  76. }
  77. static int startsWithCapital (conststring32 word) {
  78. return iswupper ((int) word [0]) || (word [0] == '\\' && iswupper ((int) word [1]));
  79. }
  80. bool SpellingChecker_isWordAllowed (SpellingChecker me, conststring32 word) {
  81. int wordLength = str32len (word);
  82. if (my allowAllWordsContaining && my allowAllWordsContaining [0]) {
  83. char32 *p = & my allowAllWordsContaining [0];
  84. while (*p) {
  85. /*
  86. * Find next token in list of allowed string parts.
  87. */
  88. char32 token [100], *q = & token [0];
  89. /*
  90. * Skip spaces in list.
  91. */
  92. while (*p == U' ') p ++;
  93. /*
  94. * Collect one token string from list.
  95. */
  96. while (*p != U'\0' && *p != U' ') {
  97. *q ++ = *p ++;
  98. }
  99. *q = U'\0'; // trailing null character
  100. /*
  101. * Allow word if it contains this token.
  102. */
  103. if (str32str (word, token)) return true;
  104. }
  105. }
  106. if (my allowAllNames) {
  107. /*
  108. * Allow word if it starts with a capital.
  109. */
  110. if (startsWithCapital (word)) {
  111. return true;
  112. }
  113. if (my namePrefixes && my namePrefixes [0]) {
  114. char32 *p = & my namePrefixes [0];
  115. while (*p) {
  116. char32 token [100], *q = & token [0];
  117. while (*p == U' ') p ++;
  118. while (*p != U'\0' && *p != U' ') *q ++ = *p ++;
  119. *q = U'\0'; // trailing null character
  120. /*
  121. * Allow word if starts with this prefix
  122. * and this prefix is followed by a capital.
  123. */
  124. if (str32str (word, token) == word && startsWithCapital (word + str32len (token))) {
  125. return true;
  126. }
  127. }
  128. }
  129. } else if (my allowAllAbbreviations && startsWithCapital (word)) {
  130. const char32 *p = & word [0];
  131. for (;;) {
  132. if (*p == '\0') return true;
  133. if (iswlower ((int) *p)) break;
  134. p ++;
  135. }
  136. }
  137. if (my allowAllWordsStartingWith && my allowAllWordsStartingWith [0]) {
  138. const char32 *p = & my allowAllWordsStartingWith [0];
  139. while (*p) {
  140. char32 token [100], *q = & token [0];
  141. int tokenLength;
  142. while (*p == U' ') p ++;
  143. while (*p != U'\0' && *p != U' ') *q ++ = *p ++;
  144. *q = U'\0'; // trailing null character
  145. tokenLength = str32len (token);
  146. if (wordLength >= tokenLength && str32nequ (token, word, tokenLength)) {
  147. return true;
  148. }
  149. }
  150. }
  151. if (my allowAllWordsEndingIn && my allowAllWordsEndingIn [0]) {
  152. const char32 *p = & my allowAllWordsEndingIn [0];
  153. while (*p) {
  154. char32 token [100], *q = & token [0];
  155. int tokenLength;
  156. while (*p == U' ') p ++;
  157. while (*p != U'\0' && *p != U' ') *q ++ = *p ++;
  158. *q = U'\0'; // trailing null character
  159. tokenLength = str32len (token);
  160. if (wordLength >= tokenLength && str32nequ (token, word + wordLength - tokenLength, tokenLength)) {
  161. return true;
  162. }
  163. }
  164. }
  165. if (WordList_hasWord (my wordList.get(), word))
  166. return true;
  167. if (my userDictionary->size > 0) {
  168. if (str32len (word) > 3333) return false; // superfluous, because WordList_hasWord already checked; but safe
  169. static char32 buffer [3*3333+1];
  170. Longchar_genericize32 (word, buffer);
  171. if (my userDictionary -> lookUp (buffer) != 0)
  172. return true;
  173. }
  174. return false;
  175. }
  176. void SpellingChecker_addNewWord (SpellingChecker me, conststring32 word) {
  177. try {
  178. autostring32 generic (3 * str32len (word));
  179. Longchar_genericize32 (word, generic.get());
  180. my userDictionary -> addString_copy (generic.get());
  181. } catch (MelderError) {
  182. Melder_throw (me, U": word \"", word, U"\" not added.");
  183. }
  184. }
  185. char32 * SpellingChecker_nextNotAllowedWord (SpellingChecker me, conststring32 sentence, integer *start) {
  186. const char32 *p = & sentence [*start];
  187. for (;;) {
  188. if (*p == U'\0') {
  189. return nullptr; // all words allowed
  190. } else if (*p == U'(' && my allowAllParenthesized) {
  191. p ++;
  192. for (;;) {
  193. if (*p == U'\0') {
  194. return nullptr; // everything is parenthesized...
  195. } else if (*p == U')') {
  196. p ++;
  197. break;
  198. } else {
  199. p ++;
  200. }
  201. }
  202. } else if (*p == U' ' || (my separatingCharacters && str32chr (my separatingCharacters.get(), *p))) {
  203. p ++;
  204. } else {
  205. static char32 word [100];
  206. char32 *q = & word [0];
  207. *start = p - sentence;
  208. for (;;) {
  209. if (*p == U'\0' || *p == U' ' || (my separatingCharacters && str32chr (my separatingCharacters.get(), *p))) {
  210. *q ++ = U'\0';
  211. if (SpellingChecker_isWordAllowed (me, word)) {
  212. /* Don't increment p (may contain a zero or a parenthesis). */
  213. break;
  214. } else {
  215. return word;
  216. }
  217. } else {
  218. *q ++ = *p ++;
  219. }
  220. }
  221. }
  222. }
  223. return nullptr; // all words allowed
  224. }
  225. /* End of file SpellingChecker.cpp */