123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- #if !defined documentlist_h
- #define documentlist_h
- /**
- * This file is part of uhferret.
- *
- * Author:: Peter Lane
- * Copyright:: Copyright 2011, Peter Lane.
- * License:: GPLv3
- *
- * uhferret is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * uhferret is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
- */
- #include <assert.h>
- #include <vector>
- #include "tokenset.h"
- #include "tupleset.h"
- #include "document.h"
- /** DocumentList maintains a list of documents, a TokenSet of identified Tokens and
- * a TupleSet, which maps from sequences of three tokens to lists of documents
- * in which the trigrams were found.
- * -- Methods are provided to calculate information about pairs of documents,
- * such as Resemblance and Containment.
- * -- Note that the Documents are owned by this class although not created by it,
- * and hence all Documents are destroyed with the DocumentList.
- */
- class DocumentList
- {
- public:
- DocumentList ();
- ~DocumentList ();
- void AddDocument (std::string pathname, Document::DocumentType type = Document::TypeText);
- void AddDocument (std::string pathname, Document::DocumentType type, int id);
- Document * getDocument (std::size_t i) const;
- void RemoveDocument (Document * doc);
- TokenSet & GetTokenSet ();
- TupleSet & GetTupleSet ();
- void Clear ();
- int GetNewGroupId ();
- void ResetReading ();
- int Size () const;
- int NumberOfPairs () const;
- void RunFerret (int first_document = 0);
- void ReadDocument (int i);
- void ClearSimilarities ();
- void ComputeSimilarities ();
- int GetTotalTrigramCount ();
- int CountTrigrams (int doc_i);
- int CountMatches (int doc_i, int doc_j);
- float ComputeResemblance (int doc_i, int doc_j);
- float ComputeContainment (int doc_i, int doc_j);
- // check if given trigram is in both the indexed documents
- bool IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
- // convert given trigram into a string
- std::string MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2);
- // collect all the matching trigrams in the two documents into a vector of strings
- std::vector<std::string> CollectMatchingTrigrams (int doc1, int doc2);
- private:
- std::vector<Document *> _documents;
- TokenSet _token_set;
- TupleSet _tuple_set;
- std::vector<int> _matches;
- int _last_group_id;
- };
- #endif
|