filteredbrk.cpp 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2014-2015, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. *******************************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
  11. #include "cmemory.h"
  12. #include "unicode/filteredbrk.h"
  13. #include "unicode/ucharstriebuilder.h"
  14. #include "unicode/ures.h"
  15. #include "uresimp.h" // ures_getByKeyWithFallback
  16. #include "ubrkimpl.h" // U_ICUDATA_BRKITR
  17. #include "uvector.h"
  18. #include "cmemory.h"
  19. #include "umutex.h"
  20. U_NAMESPACE_BEGIN
  21. #ifndef FB_DEBUG
  22. #define FB_DEBUG 0
  23. #endif
  24. #if FB_DEBUG
  25. #include <stdio.h>
  26. static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  27. char buf[2048];
  28. if(s) {
  29. s->extract(0,s->length(),buf,2048);
  30. } else {
  31. strcpy(buf,"nullptr");
  32. }
  33. fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
  34. f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
  35. }
  36. #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
  37. #else
  38. #define FB_TRACE(m,s,b,d)
  39. #endif
  40. /**
  41. * Used with sortedInsert()
  42. */
  43. static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
  44. const UnicodeString &a = *(const UnicodeString*)t1.pointer;
  45. const UnicodeString &b = *(const UnicodeString*)t2.pointer;
  46. return a.compare(b);
  47. }
  48. /**
  49. * A UVector which implements a set of strings.
  50. */
  51. class UStringSet : public UVector {
  52. public:
  53. UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
  54. uhash_compareUnicodeString,
  55. 1,
  56. status) {}
  57. virtual ~UStringSet();
  58. /**
  59. * Is this UnicodeSet contained?
  60. */
  61. inline UBool contains(const UnicodeString& s) {
  62. return contains((void*) &s);
  63. }
  64. using UVector::contains;
  65. /**
  66. * Return the ith UnicodeString alias
  67. */
  68. inline const UnicodeString* getStringAt(int32_t i) const {
  69. return (const UnicodeString*)elementAt(i);
  70. }
  71. /**
  72. * Adopt the UnicodeString if not already contained.
  73. * Caller no longer owns the pointer in any case.
  74. * @return true if adopted successfully, false otherwise (error, or else duplicate)
  75. */
  76. inline UBool adopt(UnicodeString *str, UErrorCode &status) {
  77. if(U_FAILURE(status) || contains(*str)) {
  78. delete str;
  79. return false;
  80. } else {
  81. sortedInsert(str, compareUnicodeString, status);
  82. if(U_FAILURE(status)) {
  83. return false;
  84. }
  85. return true;
  86. }
  87. }
  88. /**
  89. * Add by value.
  90. * @return true if successfully adopted.
  91. */
  92. inline UBool add(const UnicodeString& str, UErrorCode &status) {
  93. if(U_FAILURE(status)) return false;
  94. UnicodeString *t = new UnicodeString(str);
  95. if(t==nullptr) {
  96. status = U_MEMORY_ALLOCATION_ERROR; return false;
  97. }
  98. return adopt(t, status);
  99. }
  100. /**
  101. * Remove this string.
  102. * @return true if successfully removed, false otherwise (error, or else it wasn't there)
  103. */
  104. inline UBool remove(const UnicodeString &s, UErrorCode &status) {
  105. if(U_FAILURE(status)) return false;
  106. return removeElement((void*) &s);
  107. }
  108. };
  109. /**
  110. * Virtual, won't be inlined
  111. */
  112. UStringSet::~UStringSet() {}
  /* ----------------------------------------------------------- */
  /* Filtered Break constants */

  // Values stored with trie entries.
  static constexpr int32_t kPARTIAL = 0x1; //< partial - need to run through forward trie
  static constexpr int32_t kMATCH = 0x2; //< exact match - skip this one.

  // Bit flags kept per suppression string while the tries are being built.
  static constexpr int32_t kSuppressInReverse = 0x1;
  static constexpr int32_t kAddToForward = 0x2;

  static constexpr char16_t kFULLSTOP = u'.'; // U+002E
  120. /**
  121. * Shared data for SimpleFilteredSentenceBreakIterator
  122. */
  123. class SimpleFilteredSentenceBreakData : public UMemory {
  124. public:
  125. SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
  126. : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
  127. SimpleFilteredSentenceBreakData *incr() {
  128. umtx_atomic_inc(&refcount);
  129. return this;
  130. }
  131. SimpleFilteredSentenceBreakData *decr() {
  132. if(umtx_atomic_dec(&refcount) <= 0) {
  133. delete this;
  134. }
  135. return nullptr;
  136. }
  137. virtual ~SimpleFilteredSentenceBreakData();
  138. bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
  139. bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
  140. const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
  141. const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
  142. private:
  143. // These tries own their data arrays.
  144. // They are shared and must therefore not be modified.
  145. LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
  146. LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
  147. u_atomic_int32_t refcount;
  148. };
  149. SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
  150. /**
  151. * Concrete implementation
  152. */
  153. class SimpleFilteredSentenceBreakIterator : public BreakIterator {
  154. public:
  155. SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
  156. SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
  157. virtual ~SimpleFilteredSentenceBreakIterator();
  158. private:
  159. SimpleFilteredSentenceBreakData *fData;
  160. LocalPointer<BreakIterator> fDelegate;
  161. LocalUTextPointer fText;
  162. /* -- subclass interface -- */
  163. public:
  164. /* -- cloning and other subclass stuff -- */
  165. virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
  166. int32_t &/*BufferSize*/,
  167. UErrorCode &status) override {
  168. // for now - always deep clone
  169. status = U_SAFECLONE_ALLOCATED_WARNING;
  170. return clone();
  171. }
  172. virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); }
  173. virtual UClassID getDynamicClassID() const override { return nullptr; }
  174. virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; }
  175. /* -- text modifying -- */
  176. virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); }
  177. virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; }
  178. virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); }
  179. virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); }
  180. /* -- other functions that are just delegated -- */
  181. virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); }
  182. virtual CharacterIterator& getText() const override { return fDelegate->getText(); }
  183. /* -- ITERATION -- */
  184. virtual int32_t first() override;
  185. virtual int32_t preceding(int32_t offset) override;
  186. virtual int32_t previous() override;
  187. virtual UBool isBoundary(int32_t offset) override;
  188. virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
  189. virtual int32_t next() override;
  190. virtual int32_t next(int32_t n) override;
  191. virtual int32_t following(int32_t offset) override;
  192. virtual int32_t last() override;
  193. private:
  194. /**
  195. * Given that the fDelegate has already given its "initial" answer,
  196. * find the NEXT actual (non-excepted) break.
  197. * @param n initial position from delegate
  198. * @return new break position or UBRK_DONE
  199. */
  200. int32_t internalNext(int32_t n);
  201. /**
  202. * Given that the fDelegate has already given its "initial" answer,
  203. * find the PREV actual (non-excepted) break.
  204. * @param n initial position from delegate
  205. * @return new break position or UBRK_DONE
  206. */
  207. int32_t internalPrev(int32_t n);
  208. /**
  209. * set up the UText with the value of the fDelegate.
  210. * Call this before calling breakExceptionAt.
  211. * May be able to avoid excess calls
  212. */
  213. void resetState(UErrorCode &status);
  214. /**
  215. * Is there a match (exception) at this spot?
  216. */
  217. enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
  218. /**
  219. * Determine if there is an exception at this spot
  220. * @param n spot to check
  221. * @return kNoExceptionHere or kExceptionHere
  222. **/
  223. enum EFBMatchResult breakExceptionAt(int32_t n);
  224. };
  225. SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
  226. : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
  227. {
  228. }
  229. SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
  230. BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
  231. fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
  232. fDelegate(adopt)
  233. {
  234. if (fData == nullptr) {
  235. delete forwards;
  236. delete backwards;
  237. if (U_SUCCESS(status)) {
  238. status = U_MEMORY_ALLOCATION_ERROR;
  239. }
  240. }
  241. }
  242. SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
  243. fData = fData->decr();
  244. }
  245. void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
  246. fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
  247. }
  248. SimpleFilteredSentenceBreakIterator::EFBMatchResult
  249. SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
  250. int64_t bestPosn = -1;
  251. int32_t bestValue = -1;
  252. // loops while 'n' points to an exception.
  253. utext_setNativeIndex(fText.getAlias(), n); // from n..
  254. //if(debug2) u_printf(" n@ %d\n", n);
  255. // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
  256. if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
  257. // TODO only do this the 1st time?
  258. //if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch);
  259. } else {
  260. //if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch);
  261. utext_next32(fText.getAlias());
  262. //if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch);
  263. }
  264. {
  265. // Do not modify the shared trie!
  266. UCharsTrie iter(fData->getBackwardsTrie());
  267. UChar32 uch;
  268. while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
  269. UStringTrieResult r = iter.nextForCodePoint(uch);
  270. if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
  271. bestPosn = utext_getNativeIndex(fText.getAlias());
  272. bestValue = iter.getValue();
  273. }
  274. if(!USTRINGTRIE_HAS_NEXT(r)) {
  275. break;
  276. }
  277. //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias()));
  278. }
  279. }
  280. //if(bestValue >= 0) {
  281. //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
  282. //}
  283. if(bestPosn>=0) {
  284. //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
  285. //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
  286. //int32_t bestValue = iter.getValue();
  287. ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue);
  288. if(bestValue == kMATCH) { // exact match!
  289. //if(debug2) u_printf(" exact backward match\n");
  290. return kExceptionHere; // See if the next is another exception.
  291. } else if(bestValue == kPARTIAL
  292. && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
  293. //if(debug2) u_printf(" partial backward match\n");
  294. // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
  295. // to see if it matches something going forward.
  296. UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
  297. utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
  298. //if(debug2) u_printf("Retrying at %d\n", bestPosn);
  299. // Do not modify the shared trie!
  300. UCharsTrie iter(fData->getForwardsPartialTrie());
  301. UChar32 uch;
  302. while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
  303. USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
  304. //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
  305. }
  306. if(USTRINGTRIE_MATCHES(rfwd)) {
  307. //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch);
  308. // only full matches here, nothing to check
  309. // skip the next:
  310. return kExceptionHere;
  311. } else {
  312. //if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch);
  313. // no match (no exception) -return the 'underlying' break
  314. return kNoExceptionHere;
  315. }
  316. } else {
  317. return kNoExceptionHere; // internal error and/or no forwards trie
  318. }
  319. } else {
  320. //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match
  321. return kNoExceptionHere; // No match - so exit. Not an exception.
  322. }
  323. }
  324. // the workhorse single next.
  325. int32_t
  326. SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
  327. if(n == UBRK_DONE || // at end or
  328. !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
  329. return n;
  330. }
  331. // OK, do we need to break here?
  332. UErrorCode status = U_ZERO_ERROR;
  333. // refresh text
  334. resetState(status);
  335. if(U_FAILURE(status)) return UBRK_DONE; // bail out
  336. int64_t utextLen = utext_nativeLength(fText.getAlias());
  337. //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
  338. while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
  339. SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
  340. switch(m) {
  341. case kExceptionHere:
  342. n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
  343. continue;
  344. default:
  345. case kNoExceptionHere:
  346. return n;
  347. }
  348. }
  349. return n;
  350. }
  351. int32_t
  352. SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
  353. if(n == 0 || n == UBRK_DONE || // at end or
  354. !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
  355. return n;
  356. }
  357. // OK, do we need to break here?
  358. UErrorCode status = U_ZERO_ERROR;
  359. // refresh text
  360. resetState(status);
  361. if(U_FAILURE(status)) return UBRK_DONE; // bail out
  362. //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
  363. while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
  364. SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
  365. switch(m) {
  366. case kExceptionHere:
  367. n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
  368. continue;
  369. default:
  370. case kNoExceptionHere:
  371. return n;
  372. }
  373. }
  374. return n;
  375. }
  376. int32_t
  377. SimpleFilteredSentenceBreakIterator::next() {
  378. return internalNext(fDelegate->next());
  379. }
  380. int32_t
  381. SimpleFilteredSentenceBreakIterator::first() {
  382. // Don't suppress a break opportunity at the beginning of text.
  383. return fDelegate->first();
  384. }
  385. int32_t
  386. SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
  387. return internalPrev(fDelegate->preceding(offset));
  388. }
  389. int32_t
  390. SimpleFilteredSentenceBreakIterator::previous() {
  391. return internalPrev(fDelegate->previous());
  392. }
  393. UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
  394. if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
  395. if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
  396. UErrorCode status = U_ZERO_ERROR;
  397. resetState(status);
  398. SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
  399. switch(m) {
  400. case kExceptionHere:
  401. return false;
  402. default:
  403. case kNoExceptionHere:
  404. return true;
  405. }
  406. }
  407. int32_t
  408. SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
  409. return internalNext(fDelegate->next(offset));
  410. }
  411. int32_t
  412. SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
  413. return internalNext(fDelegate->following(offset));
  414. }
  415. int32_t
  416. SimpleFilteredSentenceBreakIterator::last() {
  417. // Don't suppress a break opportunity at the end of text.
  418. return fDelegate->last();
  419. }
  420. /**
  421. * Concrete implementation of builder class.
  422. */
  423. class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
  424. public:
  425. virtual ~SimpleFilteredBreakIteratorBuilder();
  426. SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
  427. SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
  428. virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
  429. virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
  430. virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override;
  431. private:
  432. UStringSet fSet;
  433. };
  434. SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
  435. {
  436. }
  437. SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
  438. : fSet(status)
  439. {
  440. }
  441. SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
  442. : fSet(status)
  443. {
  444. if(U_SUCCESS(status)) {
  445. UErrorCode subStatus = U_ZERO_ERROR;
  446. LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
  447. if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
  448. status = subStatus; // copy the failing status
  449. #if FB_DEBUG
  450. fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
  451. #endif
  452. return; // leaves the builder empty, if you try to use it.
  453. }
  454. LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", nullptr, &subStatus));
  455. if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
  456. status = subStatus; // copy the failing status
  457. #if FB_DEBUG
  458. fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
  459. #endif
  460. return; // leaves the builder empty, if you try to use it.
  461. }
  462. LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", nullptr, &subStatus));
  463. #if FB_DEBUG
  464. {
  465. UErrorCode subsub = subStatus;
  466. fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
  467. }
  468. #endif
  469. if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
  470. status = subStatus; // copy the failing status
  471. #if FB_DEBUG
  472. fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
  473. #endif
  474. return; // leaves the builder empty, if you try to use it.
  475. }
  476. LocalUResourceBundlePointer strs;
  477. subStatus = status; // Pick up inherited warning status now
  478. do {
  479. strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
  480. if(strs.isValid() && U_SUCCESS(subStatus)) {
  481. UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
  482. suppressBreakAfter(str, status); // load the string
  483. }
  484. } while (strs.isValid() && U_SUCCESS(subStatus));
  485. if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
  486. status = subStatus;
  487. }
  488. }
  489. }
  490. UBool
  491. SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
  492. {
  493. UBool r = fSet.add(exception, status);
  494. FB_TRACE("suppressBreakAfter",&exception,r,0);
  495. return r;
  496. }
  497. UBool
  498. SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
  499. {
  500. UBool r = fSet.remove(exception, status);
  501. FB_TRACE("unsuppressBreakAfter",&exception,r,0);
  502. return r;
  503. }
  504. /**
  505. * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
  506. * Work around this.
  507. *
  508. * Note: "new UnicodeString[subCount]" ends up calling global operator new
  509. * on MSVC2012 for some reason.
  510. */
  511. static inline UnicodeString* newUnicodeStringArray(size_t count) {
  512. return new UnicodeString[count ? count : 1];
  513. }
  514. BreakIterator *
  515. SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
  516. LocalPointer<BreakIterator> adopt(adoptBreakIterator);
  517. LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
  518. LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
  519. if(U_FAILURE(status)) {
  520. return nullptr;
  521. }
  522. int32_t revCount = 0;
  523. int32_t fwdCount = 0;
  524. int32_t subCount = fSet.size();
  525. UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
  526. LocalArray<UnicodeString> ustrs(ustrs_ptr);
  527. LocalMemory<int> partials;
  528. partials.allocateInsteadAndReset(subCount);
  529. LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
  530. LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
  531. int n=0;
  532. for ( int32_t i = 0;
  533. i<fSet.size();
  534. i++) {
  535. const UnicodeString *abbr = fSet.getStringAt(i);
  536. if(abbr) {
  537. FB_TRACE("build",abbr,true,i);
  538. ustrs[n] = *abbr; // copy by value
  539. FB_TRACE("ustrs[n]",&ustrs[n],true,i);
  540. } else {
  541. FB_TRACE("build",abbr,false,i);
  542. status = U_MEMORY_ALLOCATION_ERROR;
  543. return nullptr;
  544. }
  545. partials[n] = 0; // default: not partial
  546. n++;
  547. }
  548. // first pass - find partials.
  549. for(int i=0;i<subCount;i++) {
  550. int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
  551. if(nn>-1 && (nn+1)!=ustrs[i].length()) {
  552. FB_TRACE("partial",&ustrs[i],false,i);
  553. // is partial.
  554. // is it unique?
  555. int sameAs = -1;
  556. for(int j=0;j<subCount;j++) {
  557. if(j==i) continue;
  558. if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
  559. FB_TRACE("prefix",&ustrs[j],false,nn+1);
  560. //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
  561. if(partials[j]==0) { // hasn't been processed yet
  562. partials[j] = kSuppressInReverse | kAddToForward;
  563. FB_TRACE("suppressing",&ustrs[j],false,j);
  564. } else if(partials[j] & kSuppressInReverse) {
  565. sameAs = j; // the other entry is already in the reverse table.
  566. }
  567. }
  568. }
  569. FB_TRACE("for partial same-",&ustrs[i],false,sameAs);
  570. FB_TRACE(" == partial #",&ustrs[i],false,partials[i]);
  571. UnicodeString prefix(ustrs[i], 0, nn+1);
  572. if(sameAs == -1 && partials[i] == 0) {
  573. // first one - add the prefix to the reverse table.
  574. prefix.reverse();
  575. builder->add(prefix, kPARTIAL, status);
  576. revCount++;
  577. FB_TRACE("Added partial",&prefix,false, i);
  578. FB_TRACE(u_errorName(status),&ustrs[i],false,i);
  579. partials[i] = kSuppressInReverse | kAddToForward;
  580. } else {
  581. FB_TRACE("NOT adding partial",&prefix,false, i);
  582. FB_TRACE(u_errorName(status),&ustrs[i],false,i);
  583. }
  584. }
  585. }
  586. for(int i=0;i<subCount;i++) {
  587. if(partials[i]==0) {
  588. ustrs[i].reverse();
  589. builder->add(ustrs[i], kMATCH, status);
  590. revCount++;
  591. FB_TRACE(u_errorName(status), &ustrs[i], false, i);
  592. } else {
  593. FB_TRACE("Adding fwd",&ustrs[i], false, i);
  594. // an optimization would be to only add the portion after the '.'
  595. // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
  596. // instead of "Ph.D." since we already know the "Ph." part is a match.
  597. // would need the trie to be able to hold 0-length strings, though.
  598. builder2->add(ustrs[i], kMATCH, status); // forward
  599. fwdCount++;
  600. //ustrs[i].reverse();
  601. ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
  602. }
  603. }
  604. FB_TRACE("AbbrCount",nullptr,false, subCount);
  605. if(revCount>0) {
  606. backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
  607. if(U_FAILURE(status)) {
  608. FB_TRACE(u_errorName(status),nullptr,false, -1);
  609. return nullptr;
  610. }
  611. }
  612. if(fwdCount>0) {
  613. forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
  614. if(U_FAILURE(status)) {
  615. FB_TRACE(u_errorName(status),nullptr,false, -1);
  616. return nullptr;
  617. }
  618. }
  619. return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
  620. }
  621. // ----------- Base class implementation
  622. FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
  623. }
  624. FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
  625. }
  626. FilteredBreakIteratorBuilder *
  627. FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
  628. if(U_FAILURE(status)) return nullptr;
  629. LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
  630. return (U_SUCCESS(status))? ret.orphan(): nullptr;
  631. }
  632. FilteredBreakIteratorBuilder *
  633. FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
  634. return createEmptyInstance(status);
  635. }
  636. FilteredBreakIteratorBuilder *
  637. FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
  638. if(U_FAILURE(status)) return nullptr;
  639. LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
  640. return (U_SUCCESS(status))? ret.orphan(): nullptr;
  641. }
  642. U_NAMESPACE_END
  643. #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION