123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- #if !defined tupleset_h
- #define tupleset_h
- /**
- * This file is part of uhferret.
- * Initial triple map idea by Bob Dickerson.
- *
- * Author:: Peter Lane
- * Copyright:: Copyright 2011, Peter Lane.
- * License:: GPLv3
- *
- * uhferret is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * uhferret is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
- */
- #include <assert.h>
- #include <map>
- #include <string>
- #include <vector>
- #include "tokenset.h"
- /** TupleSet maintains the database mapping trigrams to the identifiers of the documents which contain them.
- * The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
- * a token as a key. The end result of the three maps is a vector of document identifiers.
- *
- * The most important feature of the TupleSet is the collection of methods for iterating over
- * all tuples in the TupleSet.
- * e.g. with the definition: TupleSet tuple_set;
- * use: for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
- * {}
- * to iterate over all the tuples. The methods: GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
- * and GetToken return information on the current tuple.
- */
- class TupleSet
- {
- /* Private typedefs for the three nested map levels and their const iterators.
-  * Every level is keyed by a std::size_t token reference:
-  *   TripMap (first token) -> PairMap (second token) -> WordMap (third token)
-  *   -> std::vector<int> holding document identifiers.
-  */
- typedef std::map<std::size_t, std::vector<int> > WordMap;
- typedef WordMap::const_iterator WordMapIter;
- typedef std::map<std::size_t, WordMap> PairMap;
- typedef PairMap::const_iterator PairMapIter;
- typedef std::map<std::size_t, PairMap> TripMap;
- typedef TripMap::const_iterator TripMapIter;
- public:
- TupleSet (); // constructor; defined outside this header
- void Clear (); // presumably empties the tuple map -- confirm against the implementation
- int Size (); // presumably the number of stored tuples -- confirm against the implementation
- /* Given a tuple (three token references), return the list of documents which
-  * contain that tuple. The list is returned by non-const reference, so the
-  * caller can modify it in place.
-  * NOTE(review): what happens for a tuple that is not stored cannot be seen
-  * from this header -- check the implementation before relying on it.
-  */
- std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
- std::size_t token_1, std::size_t token_2);
- // given a tuple and a document identifier,
- // - make sure that the document is in the list for that tuple
- // - return true if the document was not already in the tuple's list
- bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
- int document);
- // check if the two documents doc1 and doc2 share the given tuple (t0, t1, t2)
- bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
- /* Collect and return, as strings, all tuples in the two given documents.
-  * NOTE(review): presumably only tuples present in both documents are
-  * returned, and tokenset is presumably used to turn token references into
-  * text -- confirm both against the implementation.
-  */
- std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
- private:
- TripMap _tuple_map; // the database: first token -> second token -> third token -> documents
- public: // following methods and data structures are to handle an iterator on tupleset; use as: for (Begin (); HasMore (); GetNext ()) {}
- void Begin (); // start the iterator
- void GetNext (); // advance the iterator to the next tuple
- bool HasMore () const; // check for end of the iteration
- // retrieve the current tuple's documents (returned by non-const reference)
- std::vector<int> & GetDocumentsForCurrentTuple ();
- // retrieve a string for the current tuple; takes the TokenSet, presumably to look up token text -- confirm
- std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
- // retrieve the identifier of an individual token of the current tuple; i presumably selects position 0, 1 or 2 -- confirm
- std::size_t GetToken (int i) const;
- private:
- // Iteration state is stored in the object itself, one iterator per map level.
- TripMapIter _ti; // iterator from first token to pairs
- PairMapIter _pi; // iterator from second token to words
- WordMapIter _wi; // iterator from third token to document list
- };
- #endif
|