123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 2014-2015, International Business Machines Corporation and
- * others. All Rights Reserved.
- *******************************************************************************
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
- #include "cmemory.h"
- #include "unicode/filteredbrk.h"
- #include "unicode/ucharstriebuilder.h"
- #include "unicode/ures.h"
- #include "uresimp.h" // ures_getByKeyWithFallback
- #include "ubrkimpl.h" // U_ICUDATA_BRKITR
- #include "uvector.h"
- #include "cmemory.h"
- #include "umutex.h"
- U_NAMESPACE_BEGIN
- #ifndef FB_DEBUG
- #define FB_DEBUG 0
- #endif
- #if FB_DEBUG
- #include <stdio.h>
- static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
- char buf[2048];
- if(s) {
- s->extract(0,s->length(),buf,2048);
- } else {
- strcpy(buf,"nullptr");
- }
- fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
- f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
- }
- #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
- #else
- #define FB_TRACE(m,s,b,d)
- #endif
- /**
- * Used with sortedInsert()
- */
- static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
- const UnicodeString &a = *(const UnicodeString*)t1.pointer;
- const UnicodeString &b = *(const UnicodeString*)t2.pointer;
- return a.compare(b);
- }
- /**
- * A UVector which implements a set of strings.
- */
- class UStringSet : public UVector {
- public:
- UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
- uhash_compareUnicodeString,
- 1,
- status) {}
- virtual ~UStringSet();
- /**
- * Is this UnicodeSet contained?
- */
- inline UBool contains(const UnicodeString& s) {
- return contains((void*) &s);
- }
- using UVector::contains;
- /**
- * Return the ith UnicodeString alias
- */
- inline const UnicodeString* getStringAt(int32_t i) const {
- return (const UnicodeString*)elementAt(i);
- }
- /**
- * Adopt the UnicodeString if not already contained.
- * Caller no longer owns the pointer in any case.
- * @return true if adopted successfully, false otherwise (error, or else duplicate)
- */
- inline UBool adopt(UnicodeString *str, UErrorCode &status) {
- if(U_FAILURE(status) || contains(*str)) {
- delete str;
- return false;
- } else {
- sortedInsert(str, compareUnicodeString, status);
- if(U_FAILURE(status)) {
- return false;
- }
- return true;
- }
- }
- /**
- * Add by value.
- * @return true if successfully adopted.
- */
- inline UBool add(const UnicodeString& str, UErrorCode &status) {
- if(U_FAILURE(status)) return false;
- UnicodeString *t = new UnicodeString(str);
- if(t==nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR; return false;
- }
- return adopt(t, status);
- }
- /**
- * Remove this string.
- * @return true if successfully removed, false otherwise (error, or else it wasn't there)
- */
- inline UBool remove(const UnicodeString &s, UErrorCode &status) {
- if(U_FAILURE(status)) return false;
- return removeElement((void*) &s);
- }
- };
- /**
- * Virtual, won't be inlined
- */
- UStringSet::~UStringSet() {}
- /* ----------------------------------------------------------- */
- /* Filtered Break constants */
- static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
- static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
- static const int32_t kSuppressInReverse = (1<<0);
- static const int32_t kAddToForward = (1<<1);
- static const char16_t kFULLSTOP = 0x002E; // '.'
- /**
- * Shared data for SimpleFilteredSentenceBreakIterator
- */
- class SimpleFilteredSentenceBreakData : public UMemory {
- public:
- SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
- : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
- SimpleFilteredSentenceBreakData *incr() {
- umtx_atomic_inc(&refcount);
- return this;
- }
- SimpleFilteredSentenceBreakData *decr() {
- if(umtx_atomic_dec(&refcount) <= 0) {
- delete this;
- }
- return nullptr;
- }
- virtual ~SimpleFilteredSentenceBreakData();
- bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
- bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
- const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
- const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
- private:
- // These tries own their data arrays.
- // They are shared and must therefore not be modified.
- LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
- LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
- u_atomic_int32_t refcount;
- };
- SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
- /**
- * Concrete implementation
- */
- class SimpleFilteredSentenceBreakIterator : public BreakIterator {
- public:
- SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
- SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
- virtual ~SimpleFilteredSentenceBreakIterator();
- private:
- SimpleFilteredSentenceBreakData *fData;
- LocalPointer<BreakIterator> fDelegate;
- LocalUTextPointer fText;
- /* -- subclass interface -- */
- public:
- /* -- cloning and other subclass stuff -- */
- virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
- int32_t &/*BufferSize*/,
- UErrorCode &status) override {
- // for now - always deep clone
- status = U_SAFECLONE_ALLOCATED_WARNING;
- return clone();
- }
- virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); }
- virtual UClassID getDynamicClassID() const override { return nullptr; }
- virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; }
- /* -- text modifying -- */
- virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); }
- virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; }
- virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); }
- virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); }
- /* -- other functions that are just delegated -- */
- virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); }
- virtual CharacterIterator& getText() const override { return fDelegate->getText(); }
- /* -- ITERATION -- */
- virtual int32_t first() override;
- virtual int32_t preceding(int32_t offset) override;
- virtual int32_t previous() override;
- virtual UBool isBoundary(int32_t offset) override;
- virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
- virtual int32_t next() override;
- virtual int32_t next(int32_t n) override;
- virtual int32_t following(int32_t offset) override;
- virtual int32_t last() override;
- private:
- /**
- * Given that the fDelegate has already given its "initial" answer,
- * find the NEXT actual (non-excepted) break.
- * @param n initial position from delegate
- * @return new break position or UBRK_DONE
- */
- int32_t internalNext(int32_t n);
- /**
- * Given that the fDelegate has already given its "initial" answer,
- * find the PREV actual (non-excepted) break.
- * @param n initial position from delegate
- * @return new break position or UBRK_DONE
- */
- int32_t internalPrev(int32_t n);
- /**
- * set up the UText with the value of the fDelegate.
- * Call this before calling breakExceptionAt.
- * May be able to avoid excess calls
- */
- void resetState(UErrorCode &status);
- /**
- * Is there a match (exception) at this spot?
- */
- enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
- /**
- * Determine if there is an exception at this spot
- * @param n spot to check
- * @return kNoExceptionHere or kExceptionHere
- **/
- enum EFBMatchResult breakExceptionAt(int32_t n);
- };
- SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
- : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
- {
- }
- SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
- BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
- fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
- fDelegate(adopt)
- {
- if (fData == nullptr) {
- delete forwards;
- delete backwards;
- if (U_SUCCESS(status)) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- }
- SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
- fData = fData->decr();
- }
- void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
- fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
- }
- SimpleFilteredSentenceBreakIterator::EFBMatchResult
- SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
- int64_t bestPosn = -1;
- int32_t bestValue = -1;
- // loops while 'n' points to an exception.
- utext_setNativeIndex(fText.getAlias(), n); // from n..
- //if(debug2) u_printf(" n@ %d\n", n);
- // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
- if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
- // TODO only do this the 1st time?
- //if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch);
- } else {
- //if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch);
- utext_next32(fText.getAlias());
- //if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch);
- }
- {
- // Do not modify the shared trie!
- UCharsTrie iter(fData->getBackwardsTrie());
- UChar32 uch;
- while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
- UStringTrieResult r = iter.nextForCodePoint(uch);
- if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
- bestPosn = utext_getNativeIndex(fText.getAlias());
- bestValue = iter.getValue();
- }
- if(!USTRINGTRIE_HAS_NEXT(r)) {
- break;
- }
- //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias()));
- }
- }
- //if(bestValue >= 0) {
- //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
- //}
- if(bestPosn>=0) {
- //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
- //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
- //int32_t bestValue = iter.getValue();
- ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue);
- if(bestValue == kMATCH) { // exact match!
- //if(debug2) u_printf(" exact backward match\n");
- return kExceptionHere; // See if the next is another exception.
- } else if(bestValue == kPARTIAL
- && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
- //if(debug2) u_printf(" partial backward match\n");
- // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
- // to see if it matches something going forward.
- UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
- utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
- //if(debug2) u_printf("Retrying at %d\n", bestPosn);
- // Do not modify the shared trie!
- UCharsTrie iter(fData->getForwardsPartialTrie());
- UChar32 uch;
- while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
- USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
- //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
- }
- if(USTRINGTRIE_MATCHES(rfwd)) {
- //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch);
- // only full matches here, nothing to check
- // skip the next:
- return kExceptionHere;
- } else {
- //if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch);
- // no match (no exception) -return the 'underlying' break
- return kNoExceptionHere;
- }
- } else {
- return kNoExceptionHere; // internal error and/or no forwards trie
- }
- } else {
- //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match
- return kNoExceptionHere; // No match - so exit. Not an exception.
- }
- }
- // the workhorse single next.
- int32_t
- SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
- if(n == UBRK_DONE || // at end or
- !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
- return n;
- }
- // OK, do we need to break here?
- UErrorCode status = U_ZERO_ERROR;
- // refresh text
- resetState(status);
- if(U_FAILURE(status)) return UBRK_DONE; // bail out
- int64_t utextLen = utext_nativeLength(fText.getAlias());
- //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
- while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
- SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
- switch(m) {
- case kExceptionHere:
- n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
- continue;
- default:
- case kNoExceptionHere:
- return n;
- }
- }
- return n;
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
- if(n == 0 || n == UBRK_DONE || // at end or
- !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
- return n;
- }
- // OK, do we need to break here?
- UErrorCode status = U_ZERO_ERROR;
- // refresh text
- resetState(status);
- if(U_FAILURE(status)) return UBRK_DONE; // bail out
- //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
- while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
- SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
- switch(m) {
- case kExceptionHere:
- n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
- continue;
- default:
- case kNoExceptionHere:
- return n;
- }
- }
- return n;
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::next() {
- return internalNext(fDelegate->next());
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::first() {
- // Don't suppress a break opportunity at the beginning of text.
- return fDelegate->first();
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
- return internalPrev(fDelegate->preceding(offset));
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::previous() {
- return internalPrev(fDelegate->previous());
- }
- UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
- if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
- if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
- UErrorCode status = U_ZERO_ERROR;
- resetState(status);
- SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
- switch(m) {
- case kExceptionHere:
- return false;
- default:
- case kNoExceptionHere:
- return true;
- }
- }
-
- int32_t
- SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
- return internalNext(fDelegate->next(offset));
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
- return internalNext(fDelegate->following(offset));
- }
- int32_t
- SimpleFilteredSentenceBreakIterator::last() {
- // Don't suppress a break opportunity at the end of text.
- return fDelegate->last();
- }
- /**
- * Concrete implementation of builder class.
- */
- class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
- public:
- virtual ~SimpleFilteredBreakIteratorBuilder();
- SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
- SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
- virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
- virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
- virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override;
- private:
- UStringSet fSet;
- };
- SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
- {
- }
- SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
- : fSet(status)
- {
- }
- SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
- : fSet(status)
- {
- if(U_SUCCESS(status)) {
- UErrorCode subStatus = U_ZERO_ERROR;
- LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
- if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
- status = subStatus; // copy the failing status
- #if FB_DEBUG
- fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
- #endif
- return; // leaves the builder empty, if you try to use it.
- }
- LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", nullptr, &subStatus));
- if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
- status = subStatus; // copy the failing status
- #if FB_DEBUG
- fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
- #endif
- return; // leaves the builder empty, if you try to use it.
- }
- LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", nullptr, &subStatus));
- #if FB_DEBUG
- {
- UErrorCode subsub = subStatus;
- fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
- }
- #endif
-
- if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
- status = subStatus; // copy the failing status
- #if FB_DEBUG
- fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
- #endif
- return; // leaves the builder empty, if you try to use it.
- }
- LocalUResourceBundlePointer strs;
- subStatus = status; // Pick up inherited warning status now
- do {
- strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
- if(strs.isValid() && U_SUCCESS(subStatus)) {
- UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
- suppressBreakAfter(str, status); // load the string
- }
- } while (strs.isValid() && U_SUCCESS(subStatus));
- if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
- status = subStatus;
- }
- }
- }
- UBool
- SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
- {
- UBool r = fSet.add(exception, status);
- FB_TRACE("suppressBreakAfter",&exception,r,0);
- return r;
- }
- UBool
- SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
- {
- UBool r = fSet.remove(exception, status);
- FB_TRACE("unsuppressBreakAfter",&exception,r,0);
- return r;
- }
- /**
- * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
- * Work around this.
- *
- * Note: "new UnicodeString[subCount]" ends up calling global operator new
- * on MSVC2012 for some reason.
- */
- static inline UnicodeString* newUnicodeStringArray(size_t count) {
- return new UnicodeString[count ? count : 1];
- }
- BreakIterator *
- SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
- LocalPointer<BreakIterator> adopt(adoptBreakIterator);
- LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
- LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
- if(U_FAILURE(status)) {
- return nullptr;
- }
- int32_t revCount = 0;
- int32_t fwdCount = 0;
- int32_t subCount = fSet.size();
- UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
-
- LocalArray<UnicodeString> ustrs(ustrs_ptr);
- LocalMemory<int> partials;
- partials.allocateInsteadAndReset(subCount);
- LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
- LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
- int n=0;
- for ( int32_t i = 0;
- i<fSet.size();
- i++) {
- const UnicodeString *abbr = fSet.getStringAt(i);
- if(abbr) {
- FB_TRACE("build",abbr,true,i);
- ustrs[n] = *abbr; // copy by value
- FB_TRACE("ustrs[n]",&ustrs[n],true,i);
- } else {
- FB_TRACE("build",abbr,false,i);
- status = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- partials[n] = 0; // default: not partial
- n++;
- }
- // first pass - find partials.
- for(int i=0;i<subCount;i++) {
- int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
- if(nn>-1 && (nn+1)!=ustrs[i].length()) {
- FB_TRACE("partial",&ustrs[i],false,i);
- // is partial.
- // is it unique?
- int sameAs = -1;
- for(int j=0;j<subCount;j++) {
- if(j==i) continue;
- if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
- FB_TRACE("prefix",&ustrs[j],false,nn+1);
- //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
- if(partials[j]==0) { // hasn't been processed yet
- partials[j] = kSuppressInReverse | kAddToForward;
- FB_TRACE("suppressing",&ustrs[j],false,j);
- } else if(partials[j] & kSuppressInReverse) {
- sameAs = j; // the other entry is already in the reverse table.
- }
- }
- }
- FB_TRACE("for partial same-",&ustrs[i],false,sameAs);
- FB_TRACE(" == partial #",&ustrs[i],false,partials[i]);
- UnicodeString prefix(ustrs[i], 0, nn+1);
- if(sameAs == -1 && partials[i] == 0) {
- // first one - add the prefix to the reverse table.
- prefix.reverse();
- builder->add(prefix, kPARTIAL, status);
- revCount++;
- FB_TRACE("Added partial",&prefix,false, i);
- FB_TRACE(u_errorName(status),&ustrs[i],false,i);
- partials[i] = kSuppressInReverse | kAddToForward;
- } else {
- FB_TRACE("NOT adding partial",&prefix,false, i);
- FB_TRACE(u_errorName(status),&ustrs[i],false,i);
- }
- }
- }
- for(int i=0;i<subCount;i++) {
- if(partials[i]==0) {
- ustrs[i].reverse();
- builder->add(ustrs[i], kMATCH, status);
- revCount++;
- FB_TRACE(u_errorName(status), &ustrs[i], false, i);
- } else {
- FB_TRACE("Adding fwd",&ustrs[i], false, i);
- // an optimization would be to only add the portion after the '.'
- // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
- // instead of "Ph.D." since we already know the "Ph." part is a match.
- // would need the trie to be able to hold 0-length strings, though.
- builder2->add(ustrs[i], kMATCH, status); // forward
- fwdCount++;
- //ustrs[i].reverse();
- ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
- }
- }
- FB_TRACE("AbbrCount",nullptr,false, subCount);
- if(revCount>0) {
- backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
- if(U_FAILURE(status)) {
- FB_TRACE(u_errorName(status),nullptr,false, -1);
- return nullptr;
- }
- }
- if(fwdCount>0) {
- forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
- if(U_FAILURE(status)) {
- FB_TRACE(u_errorName(status),nullptr,false, -1);
- return nullptr;
- }
- }
- return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
- }
- // ----------- Base class implementation
- FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
- }
- FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
- }
- FilteredBreakIteratorBuilder *
- FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
- if(U_FAILURE(status)) return nullptr;
- LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
- return (U_SUCCESS(status))? ret.orphan(): nullptr;
- }
- FilteredBreakIteratorBuilder *
- FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
- return createEmptyInstance(status);
- }
- FilteredBreakIteratorBuilder *
- FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
- if(U_FAILURE(status)) return nullptr;
- LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
- return (U_SUCCESS(status))? ret.orphan(): nullptr;
- }
- U_NAMESPACE_END
- #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
|