tupleset.cpp 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. #include "tupleset.h"
  2. /**
  3. * This file is part of uhferret.
  4. *
  5. * Author:: Peter Lane
  6. * Copyright:: Copyright 2011, Peter Lane.
  7. * License:: GPLv3
  8. *
  9. * uhferret is free software: you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation, either version 3 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * uhferret is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU General Public License
  20. * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  21. */
  22. TupleSet::TupleSet ()
  23. {}
  24. void TupleSet::Clear ()
  25. {
  26. _tuple_map.clear ();
  27. }
  28. int TupleSet::Size ()
  29. {
  30. int trigram_count = 0;
  31. for (Begin (); HasMore (); GetNext ())
  32. {
  33. trigram_count++;
  34. }
  35. return trigram_count;
  36. }
  37. std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
  38. {
  39. return _tuple_map[token_0][token_1][token_2];
  40. }
  41. bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
  42. {
  43. bool has_doc = false;
  44. std::vector<int> & fvector = _tuple_map[token_0][token_1][token_2];
  45. // check if document is already in the trigram
  46. for (int i = 0, n = fvector.size(); i < n; ++i)
  47. {
  48. if (fvector[i] == document)
  49. {
  50. has_doc = true;
  51. break;
  52. }
  53. }
  54. if (!has_doc) // didn't have document, so add it
  55. {
  56. fvector.push_back (document);
  57. return true; // indicate that document added
  58. }
  59. return false;
  60. }
  61. bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
  62. {
  63. std::vector<int> fvector = GetDocumentsForTuple (t0, t1, t2);
  64. bool has_doc1 = false;
  65. bool has_doc2 = false;
  66. for (int i=0, n=fvector.size(); i<n; ++i)
  67. {
  68. if (fvector[i] == doc1) has_doc1 = true;
  69. if (fvector[i] == doc2) has_doc2 = true;
  70. }
  71. return ( has_doc1 && has_doc2 );
  72. }
  73. std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
  74. {
  75. std::vector<std::string> tuples;
  76. for (Begin (); HasMore (); GetNext ())
  77. {
  78. if (IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
  79. doc1, doc2))
  80. {
  81. tuples.push_back (GetStringForCurrentTuple (tokenset));
  82. }
  83. }
  84. return tuples;
  85. }
  86. void TupleSet::Begin ()
  87. {
  88. _ti = _tuple_map.begin ();
  89. _pi = (_ti->second).begin ();
  90. _wi = (_pi->second).begin ();
  91. }
  92. void TupleSet::GetNext ()
  93. {
  94. _wi++; // move to next word position
  95. if (_wi == (_pi->second).end ()) // if words have finished, then move to next pair position
  96. {
  97. _pi++;
  98. if (_pi == (_ti->second).end ()) // if pairs have finished, then move to next triple position
  99. {
  100. _ti++;
  101. if (_ti == _tuple_map.end ()) return; // finished
  102. _pi = (_ti->second).begin (); // get next pair iterator
  103. }
  104. _wi = (_pi->second).begin (); // get next word iterator
  105. }
  106. }
  107. bool TupleSet::HasMore () const
  108. {
  109. return _ti != _tuple_map.end ();
  110. }
  111. std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
  112. {
  113. return GetDocumentsForTuple (_ti->first, _pi->first, _wi->first);
  114. }
  115. std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
  116. {
  117. std::string tuple = "";
  118. tuple += tokenset.GetStringFor (_ti->first);
  119. tuple += " " + tokenset.GetStringFor (_pi->first);
  120. tuple += " " + tokenset.GetStringFor (_wi->first);
  121. return tuple;
  122. }
  123. std::size_t TupleSet::GetToken (int i) const
  124. {
  125. assert (i>=0 && i<=2);
  126. if (i == 0)
  127. return _ti->first;
  128. else if (i == 1)
  129. return _pi->first;
  130. else // if (i == 2)
  131. return _wi->first;
  132. }