123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151 |
- #include "tupleset.h"
- /**
- * This file is part of uhferret.
- *
- * Author:: Peter Lane
- * Copyright:: Copyright 2011, Peter Lane.
- * License:: GPLv3
- *
- * uhferret is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * uhferret is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
- */
- TupleSet::TupleSet ()
- {}
- void TupleSet::Clear ()
- {
- _tuple_map.clear ();
- }
- int TupleSet::Size ()
- {
- int trigram_count = 0;
- for (Begin (); HasMore (); GetNext ())
- {
- trigram_count++;
- }
- return trigram_count;
- }
- std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
- {
- return _tuple_map[token_0][token_1][token_2];
- }
- bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
- {
- bool has_doc = false;
- std::vector<int> & fvector = _tuple_map[token_0][token_1][token_2];
- // check if document is already in the trigram
- for (int i = 0, n = fvector.size(); i < n; ++i)
- {
- if (fvector[i] == document)
- {
- has_doc = true;
- break;
- }
- }
- if (!has_doc) // didn't have document, so add it
- {
- fvector.push_back (document);
- return true; // indicate that document added
- }
- return false;
- }
- bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
- {
- std::vector<int> fvector = GetDocumentsForTuple (t0, t1, t2);
- bool has_doc1 = false;
- bool has_doc2 = false;
- for (int i=0, n=fvector.size(); i<n; ++i)
- {
- if (fvector[i] == doc1) has_doc1 = true;
- if (fvector[i] == doc2) has_doc2 = true;
- }
- return ( has_doc1 && has_doc2 );
- }
- std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
- {
- std::vector<std::string> tuples;
- for (Begin (); HasMore (); GetNext ())
- {
- if (IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
- doc1, doc2))
- {
- tuples.push_back (GetStringForCurrentTuple (tokenset));
- }
- }
- return tuples;
- }
- void TupleSet::Begin ()
- {
- _ti = _tuple_map.begin ();
- _pi = (_ti->second).begin ();
- _wi = (_pi->second).begin ();
- }
- void TupleSet::GetNext ()
- {
- _wi++; // move to next word position
- if (_wi == (_pi->second).end ()) // if words have finished, then move to next pair position
- {
- _pi++;
- if (_pi == (_ti->second).end ()) // if pairs have finished, then move to next triple position
- {
- _ti++;
- if (_ti == _tuple_map.end ()) return; // finished
- _pi = (_ti->second).begin (); // get next pair iterator
- }
- _wi = (_pi->second).begin (); // get next word iterator
- }
- }
- bool TupleSet::HasMore () const
- {
- return _ti != _tuple_map.end ();
- }
- std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
- {
- return GetDocumentsForTuple (_ti->first, _pi->first, _wi->first);
- }
- std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
- {
- std::string tuple = "";
- tuple += tokenset.GetStringFor (_ti->first);
- tuple += " " + tokenset.GetStringFor (_pi->first);
- tuple += " " + tokenset.GetStringFor (_wi->first);
-
- return tuple;
- }
- std::size_t TupleSet::GetToken (int i) const
- {
- assert (i>=0 && i<=2);
- if (i == 0)
- return _ti->first;
- else if (i == 1)
- return _pi->first;
- else // if (i == 2)
- return _wi->first;
- }
|