documentlist.h 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #if !defined documentlist_h
  2. #define documentlist_h
  3. /**
  4. * This file is part of uhferret.
  5. *
  6. * Author:: Peter Lane
  7. * Copyright:: Copyright 2011, Peter Lane.
  8. * License:: GPLv3
  9. *
  10. * uhferret is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * uhferret is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  22. */
  23. #include <assert.h>
  24. #include <vector>
  25. #include "tokenset.h"
  26. #include "tupleset.h"
  27. #include "document.h"
  28. /** DocumentList maintains a list of documents, a TokenSet of identified Tokens and
  29. * a TupleSet, which maps from sequences of three tokens to lists of documents
  30. * in which the trigrams were found.
  31. * -- Methods are provided to calculate information about pairs of documents,
  32. * such as Resemblance and Containment.
  33. * -- Note that the Documents are owned by this class although not created by it,
  34. * and hence all Documents are destroyed with the DocumentList.
  35. */
  36. class DocumentList
  37. {
  38. public:
  39. DocumentList ();
  40. ~DocumentList ();
  41. void AddDocument (std::string pathname, Document::DocumentType type = Document::TypeText);
  42. void AddDocument (std::string pathname, Document::DocumentType type, int id);
  43. Document * getDocument (std::size_t i) const;
  44. void RemoveDocument (Document * doc);
  45. TokenSet & GetTokenSet ();
  46. TupleSet & GetTupleSet ();
  47. void Clear ();
  48. int GetNewGroupId ();
  49. void ResetReading ();
  50. int Size () const;
  51. int NumberOfPairs () const;
  52. void RunFerret (int first_document = 0);
  53. void ReadDocument (int i);
  54. void ClearSimilarities ();
  55. void ComputeSimilarities ();
  56. int GetTotalTrigramCount ();
  57. int CountTrigrams (int doc_i);
  58. int CountMatches (int doc_i, int doc_j);
  59. float ComputeResemblance (int doc_i, int doc_j);
  60. float ComputeContainment (int doc_i, int doc_j);
  61. // check if given trigram is in both the indexed documents
  62. bool IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
  63. // convert given trigram into a string
  64. std::string MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2);
  65. // collect all the matching trigrams in the two documents into a vector of strings
  66. std::vector<std::string> CollectMatchingTrigrams (int doc1, int doc2);
  67. private:
  68. std::vector<Document *> _documents;
  69. TokenSet _token_set;
  70. TupleSet _tuple_set;
  71. std::vector<int> _matches;
  72. int _last_group_id;
  73. };
  74. #endif