// peterlane/uhferret-gem
							#if !defined documentlist_h
#define documentlist_h

/** 
 * This file is part of uhferret.
 *
 * Author::    Peter Lane
 * Copyright:: Copyright 2011, Peter Lane.
 * License::   GPLv3
 *
 * uhferret is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uhferret is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <assert.h>
#include <cstddef>
#include <string>
#include <vector>

#include "tokenset.h"
#include "tupleset.h"
#include "document.h"

/** DocumentList maintains a list of documents, a TokenSet of identified Tokens and
  *    a TupleSet, which maps from sequences of three tokens to lists of documents
  *    in which the trigrams were found.
  * -- Methods are provided to calculate information about pairs of documents,
  *    such as Resemblance and Containment.
  * -- Note that the Documents are owned by this class although not created by it,
  *    and hence all Documents are destroyed with the DocumentList.
  * -- Because the Documents are held as raw pointers that this class destroys,
  *    copying a DocumentList is disabled: two lists holding the same pointers
  *    would each try to destroy the same Documents.
  * -- Documents are referred to by integer index in most methods.
  *    NOTE(review): presumably the index is the position in the internal
  *    document vector -- confirm against the implementation.
  */
class DocumentList
{
	public:
		DocumentList ();
		// per the ownership note above, the held Documents are destroyed here
		~DocumentList ();

		// -- managing the documents in the list

		// add a document identified by pathname; type defaults to Document::TypeText
		void AddDocument (std::string pathname, Document::DocumentType type = Document::TypeText);
		// as above, but with an explicit id
		// NOTE(review): id looks like a group id (cf. GetNewGroupId) -- confirm
		void AddDocument (std::string pathname, Document::DocumentType type, int id);
		// retrieve the document for index i
		Document * getDocument (std::size_t i) const;
		// remove the given document from the list
		// NOTE(review): whether doc is also destroyed is decided in the implementation
		void RemoveDocument (Document * doc);
		// clear the list
		// NOTE(review): exactly which members are reset is decided in the implementation
		void Clear ();
		// number of documents -- NOTE(review): presumably the size of the document vector
		int Size () const;
		// number of document pairs -- NOTE(review): presumably derived from Size ()
		int NumberOfPairs () const;

		// -- access to the shared token and tuple stores (returned by reference)

		TokenSet & GetTokenSet ();
		TupleSet & GetTupleSet ();

		// obtain a new group id -- NOTE(review): presumably based on _last_group_id
		int GetNewGroupId ();

		// -- reading documents and computing similarities
		//    NOTE(review): the exact steps of these methods live in the implementation file

		void ResetReading ();
		// first_document defaults to 0
		void RunFerret (int first_document = 0);
		void ReadDocument (int i);
		void ClearSimilarities ();
		void ComputeSimilarities ();

		// -- trigram counts and pairwise measures

		int GetTotalTrigramCount ();
		// trigram count for the document with index doc_i
		int CountTrigrams (int doc_i);
		// match count for the pair of documents with indices doc_i and doc_j
		int CountMatches (int doc_i, int doc_j);
		// Resemblance for the pair of documents with indices doc_i and doc_j
		float ComputeResemblance (int doc_i, int doc_j);
		// Containment for the pair of documents with indices doc_i and doc_j
		// NOTE(review): the direction (doc_i in doc_j, or the reverse) is not visible here
		float ComputeContainment (int doc_i, int doc_j);

		// check if given trigram is in both the indexed documents
		bool IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
		// convert given trigram into a string
		std::string MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2);
		// collect all the matching trigrams in the two documents into a vector of strings
		std::vector<std::string> CollectMatchingTrigrams (int doc1, int doc2);

	private:
		// Not copyable: these are declared private and intentionally left
		// undefined (the pre-C++11 idiom), so an accidental copy is rejected
		// at compile time, or at link time from inside the class, instead of
		// leading to the same Documents being destroyed twice.
		DocumentList (const DocumentList &);
		DocumentList & operator= (const DocumentList &);

		// Data members: order is kept as it was, since it fixes both the
		// object layout and the order in which members are initialised.
		std::vector<Document *>	_documents;	// the Documents owned by this list
		TokenSet		_token_set;
		TupleSet		_tuple_set;
		std::vector<int>	_matches;	// NOTE(review): presumably per-pair match counts -- confirm
		int			_last_group_id;
};

#endif