// tupleset.h
#if !defined tupleset_h
#define tupleset_h
/**
 * This file is part of uhferret.
 * Initial triple map idea by Bob Dickerson.
 *
 * Author:: Peter Lane
 * Copyright:: Copyright 2011, Peter Lane.
 * License:: GPLv3
 *
 * uhferret is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uhferret is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
 */
#include <assert.h>
#include <cstddef>
#include <map>
#include <string>
#include <vector>

#include "tokenset.h"
  29. /** TupleSet maintains the database mapping trigrams to identifier of documents which contain them.
  30. * The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
  31. * a token as a key. The end result of the three maps is a vector of document identifiers.
  32. *
  33. * The most important feature of the TupleSet is the collection of methods for iterating over
  34. * all tuples in the TupleSet.
  35. * e.g. with the definition: TupleSet tuple_set;
  36. * use: for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
  37. * {}
  38. * to iterate over all the tuples. The methods: GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
  39. * and GetToken0, GetToken1, GetToken2 return information on the current tuple.
  40. */
  41. class TupleSet
  42. {
  43. // typedef's to simplify declarations
  44. typedef std::map<std::size_t, std::vector<int> > WordMap;
  45. typedef WordMap::const_iterator WordMapIter;
  46. typedef std::map<std::size_t, WordMap> PairMap;
  47. typedef PairMap::const_iterator PairMapIter;
  48. typedef std::map<std::size_t, PairMap> TripMap;
  49. typedef TripMap::const_iterator TripMapIter;
  50. public:
  51. TupleSet ();
  52. void Clear ();
  53. int Size ();
  54. // given a tuple, return the list of documents which contain that tuple
  55. std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
  56. std::size_t token_1, std::size_t token_2);
  57. // given a tuple and a document identifier,
  58. // - make sure that the document is in the list for that tuple
  59. // - returns true if the document was not already in trigram's list
  60. bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
  61. int document);
  62. // check if two documents share the given tuple
  63. bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
  64. // collect and return all tuples in the two given documents
  65. std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
  66. private:
  67. TripMap _tuple_map;
  68. public: // following methods and data structures are to handle an iterator on tupleset
  69. void Begin (); // start the iterator
  70. void GetNext (); // advance the iterator
  71. bool HasMore () const; // check for end
  72. // retrieve current tuple's documents
  73. std::vector<int> & GetDocumentsForCurrentTuple ();
  74. // retrieve string for current tuple
  75. std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
  76. // retrieve identifiers for individual tokens
  77. std::size_t GetToken (int i) const;
  78. private:
  79. TripMapIter _ti; // iterator from first token to pairs
  80. PairMapIter _pi; // iterator from second token to words
  81. WordMapIter _wi; // iterator from third token to document list
  82. };
#endif