peterlane
/
uhferret-gem


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
							#include "tupleset.h"

/** 
 * This file is part of uhferret.
 *
 * Author::    Peter Lane
 * Copyright:: Copyright 2011, Peter Lane.
 * License::   GPLv3
 *
 * uhferret is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uhferret is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Default constructor: performs no explicit initialisation of its own.
 */
TupleSet::TupleSet ()
{
}

/**
 * Discard every entry held in the tuple map.
 * Only the map is cleared here; the iteration cursor is not touched,
 * so call Begin () again before iterating.
 */
void TupleSet::Clear ()
{
	_tuple_map.clear ();
}

/**
 * Count the tuples (trigrams) stored in the set by walking the whole
 * collection with the Begin / HasMore / GetNext cursor.
 *
 * Note that this restarts the cursor, so any iteration in progress is lost.
 *
 * @return number of tuples visited
 */
int TupleSet::Size ()
{
	int total = 0;

	Begin ();
	while (HasMore ())
	{
		++total;
		GetNext ();
	}

	return total;
}

/**
 * Look up the list of documents recorded against a three-token tuple.
 *
 * The lookup indexes the tuple map with operator[] at each of the three
 * levels.  NOTE(review): if the map is a nested std::map (declared in
 * tupleset.h, not visible here), a missing tuple is created with an empty
 * document list as a side effect -- confirm.
 *
 * @return reference to the document list stored for the tuple
 */
std::vector<int> & TupleSet::GetDocumentsForTuple (
		std::size_t token_0,
		std::size_t token_1,
		std::size_t token_2)
{
	return ((_tuple_map[token_0])[token_1])[token_2];
}

/**
 * Record that a document contains the given three-token tuple.
 *
 * @param token_0  first token of the tuple
 * @param token_1  second token of the tuple
 * @param token_2  third token of the tuple
 * @param document identifier of the document to record
 * @return true if the document was newly added to the tuple's list,
 *         false if it was already present
 */
bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
{
	std::vector<int> & documents = _tuple_map[token_0][token_1][token_2];

	// a document is stored at most once per tuple, so stop at the first hit
	for (std::vector<int>::const_iterator it = documents.begin ();
			it != documents.end (); ++it)
	{
		if (*it == document)
		{
			return false; // already recorded, nothing to add
		}
	}

	documents.push_back (document);
	return true;
}

/**
 * Test whether both documents contain the given three-token tuple.
 *
 * The lookup goes through GetDocumentsForTuple, which indexes the tuple map
 * with operator[].  NOTE(review): for a tuple that is not yet stored this
 * presumably inserts an empty entry -- confirm against the map type in
 * tupleset.h.
 *
 * @param t0   first token of the tuple
 * @param t1   second token of the tuple
 * @param t2   third token of the tuple
 * @param doc1 first document identifier
 * @param doc2 second document identifier
 * @return true if both doc1 and doc2 appear in the tuple's document list
 */
bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
{
	// Bind by reference: copying the document list on every call is wasted
	// work, and this is invoked once per tuple by CollectMatchingTuples.
	const std::vector<int> & documents = GetDocumentsForTuple (t0, t1, t2);

	bool has_doc1 = false;
	bool has_doc2 = false;
	for (std::vector<int>::const_iterator it = documents.begin ();
			it != documents.end (); ++it)
	{
		if (*it == doc1) has_doc1 = true;
		if (*it == doc2) has_doc2 = true;
		if (has_doc1 && has_doc2) return true; // both found, no need to scan further
	}
	return false;
}

/**
 * Gather the string form of every tuple that occurs in both documents.
 *
 * Walks the whole set with the Begin / HasMore / GetNext cursor, so any
 * iteration already in progress is restarted.
 *
 * @param doc1     first document identifier
 * @param doc2     second document identifier
 * @param tokenset used to turn token identifiers back into strings
 * @return one string per tuple shared by doc1 and doc2, in iteration order
 */
std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
{
	std::vector<std::string> matches;

	Begin ();
	while (HasMore ())
	{
		const std::size_t first = GetToken (0);
		const std::size_t second = GetToken (1);
		const std::size_t third = GetToken (2);

		if (IsMatchingTuple (first, second, third, doc1, doc2))
		{
			matches.push_back (GetStringForCurrentTuple (tokenset));
		}

		GetNext ();
	}

	return matches;
}

/**
 * Position the iteration cursor on the first tuple in the set.
 *
 * The cursor is made of three iterators: _ti over the first token, _pi over
 * the second token within _ti, and _wi over the third token within _pi.
 *
 * When the set is empty only _ti is set (to the end of the map), which is
 * what HasMore () checks; the inner iterators are left alone because there
 * is no element to take them from.  Dereferencing begin () of an empty map
 * is undefined behaviour.
 */
void TupleSet::Begin ()
{
	_ti = _tuple_map.begin ();
	if (_ti == _tuple_map.end ()) return; // empty set: nothing to iterate over

	_pi = (_ti->second).begin ();
	_wi = (_pi->second).begin ();
}

/**
 * Advance the iteration cursor to the next tuple.
 *
 * The innermost (third-token) iterator moves first; when it runs out the
 * second-token iterator moves on, and when that runs out the first-token
 * iterator moves on.  Once the first-token iterator reaches the end of the
 * map the inner iterators are left as they are and HasMore () reports false.
 *
 * Must only be called while HasMore () is true.
 */
void TupleSet::GetNext ()
{
	++_wi;
	if (_wi != (_pi->second).end ()) return; // still inside the current pair

	// third tokens exhausted for this pair: step to the next second token
	++_pi;
	if (_pi == (_ti->second).end ())
	{
		// second tokens exhausted too: step to the next first token
		++_ti;
		if (_ti == _tuple_map.end ()) return; // whole set consumed
		_pi = (_ti->second).begin ();
	}
	_wi = (_pi->second).begin ();
}

/**
 * Report whether the iteration cursor still refers to a tuple.
 *
 * @return false once the first-token iterator has reached the end of the map
 */
bool TupleSet::HasMore () const
{
	return (_ti != _tuple_map.end ());
}

/**
 * Fetch the document list of the tuple the iteration cursor is on.
 *
 * Reads the three keys from the cursor and looks the tuple up again through
 * GetDocumentsForTuple.  The cursor must be on a valid tuple.
 *
 * @return reference to the document list stored for the current tuple
 */
std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
{
	const std::size_t first = _ti->first;
	const std::size_t second = _pi->first;
	const std::size_t third = _wi->first;

	return GetDocumentsForTuple (first, second, third);
}

/**
 * Build a printable form of the tuple the iteration cursor is on.
 *
 * @param tokenset used to look up the string for each token identifier
 * @return the three token strings joined by single spaces
 */
std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
{
	std::string text;

	text += tokenset.GetStringFor (_ti->first);
	text += " ";
	text += tokenset.GetStringFor (_pi->first);
	text += " ";
	text += tokenset.GetStringFor (_wi->first);

	return text;
}

/**
 * Return one token of the tuple the iteration cursor is on.
 *
 * @param i position within the tuple: 0, 1 or 2 (asserted)
 * @return the token identifier at that position
 */
std::size_t TupleSet::GetToken (int i) const
{
	assert (i>=0 && i<=2);

	switch (i)
	{
		case 0:
			return _ti->first;
		case 1:
			return _pi->first;
		default: // i == 2
			return _wi->first;
	}
}