// peterlane / uhferret-gem
#if !defined tupleset_h
#define tupleset_h

/** 
 * This file is part of uhferret.
 * Initial triple map idea by Bob Dickerson.
 *
 * Author::    Peter Lane
 * Copyright:: Copyright 2011, Peter Lane.
 * License::   GPLv3
 *
 * uhferret is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uhferret is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <assert.h>
#include <map>
#include <string>
#include <vector>

#include "tokenset.h"

/** TupleSet maintains the database mapping trigrams (tuples of three tokens) to the 
  * identifiers of the documents which contain them.
  * The mapping is held as a sequence of nested std::maps, each map taking a std::size_t 
  * reference to a token as a key: the first token selects a PairMap, the second token 
  * selects a WordMap, and the third token selects a vector of document identifiers.
  *
  * The most important feature of the TupleSet is the collection of methods for iterating over 
  * all tuples in the TupleSet.
  * e.g. with the definition:  TupleSet tuple_set; 
  * use:                       for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
  *                            {}
  * to iterate over all the tuples.  The methods GetDocumentsForCurrentTuple, 
  * GetStringForCurrentTuple and GetToken return information on the current tuple.
  *
  * Note that the iteration state is held in a single set of member iterators 
  * (_ti, _pi, _wi) inside the TupleSet itself, not in a separate iterator object.
  */
class TupleSet
{
	// typedef's to simplify declarations

	// innermost level: third token -> list of document identifiers
	typedef std::map<std::size_t, std::vector<int> > WordMap;
	typedef WordMap::const_iterator WordMapIter;

	// middle level: second token -> WordMap
	typedef std::map<std::size_t, WordMap> PairMap;
	typedef PairMap::const_iterator PairMapIter;

	// outermost level: first token -> PairMap
	typedef std::map<std::size_t, PairMap> TripMap;
	typedef TripMap::const_iterator TripMapIter;

	public:
		TupleSet ();
		// presumably empties the stored tuple map -- TODO confirm against the implementation
		void Clear ();
		// presumably the number of tuples currently stored -- TODO confirm against the implementation
		int Size ();
		// given a tuple, return the list of documents which contain that tuple
		// NOTE(review): the result is a non-const reference, so callers are able 
		// to modify the returned vector
		std::vector<int> & GetDocumentsForTuple (std::size_t token_0, 
				std::size_t token_1, std::size_t token_2);
		// given a tuple and a document identifier, 
		// - make sure that the document is in the list for that tuple
		// - returns true if the document was not already in the tuple's list
		bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, 
				int document);
		// check if the two documents doc1 and doc2 share the given tuple (t0, t1, t2)
		bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
		// collect and return, as strings, all tuples in the two given documents
		// (tokenset is presumably used to turn token identifiers into text -- TODO confirm)
    std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
	private:
		// the three-level map from (token, token, token) to document identifiers
		TripMap	_tuple_map;
	public: // following methods and data structures are to handle an iterator on tupleset
		void Begin ();			// start the iterator at the first tuple
		void GetNext ();		// advance the iterator to the next tuple
		bool HasMore () const;		// check for end: loop while this returns true
		// retrieve current tuple's documents
		// NOTE(review): returns a non-const reference although the stored iterators 
		// are const_iterators -- check how the implementation obtains it
		std::vector<int> & GetDocumentsForCurrentTuple ();	
		// retrieve string for current tuple
		// (tokenset is presumably used to look up the text of each token -- TODO confirm)
    std::string GetStringForCurrentTuple (TokenSet & tokenset) const;	
		// retrieve identifiers for individual tokens of the current tuple
		// (i presumably selects the token position 0, 1 or 2 -- TODO confirm valid range)
		std::size_t GetToken (int i) const;
	private:
		// current position of the iteration, one iterator per level of the nested map
		TripMapIter	_ti;	// iterator from first token to pairs
		PairMapIter	_pi;	// iterator from second token to words
		WordMapIter	_wi;	// iterator from third token to document list
};

#endif