crawl.py

#!/usr/bin/env python
#
# "THE CRAWLER"
#
# In this three-part search engine, the crawler is responsible for retrieving
# information from the filesystem and assembling it into a form that can be
# queried later on.
#
# Note that we overwrite the index file every time we crawl. For this release
# there is no "delta checking" on the filesystem (at least not yet).
#
# VERSIONING:
# - Mechanism that allows the filesystem tree information to be stored in a
#   database for querying afterwards. Most, if not all, of the filesystem can
#   be crawled now.
# - The crawler is now a Python module, allowing it to be imported from
#   another Python program (see the usage sketch below).
#
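# A minimal usage sketch, assuming this file is importable as `crawl` on
# sys.path (the other parts of the search engine may wire it up differently):
#
#     from crawl import Crawler
#
#     crawler = Crawler()           # drops and recreates index.db
#     crawler.crawl('/home/user')   # walk the tree, fill the fileindex table
#     crawler.close()               # release the sqlite connection
#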
import os
import sys
import sqlite3 as sql
import time

# All the "index-searchable" filetypes:
indexable = [
    'txt',
    'csv',
    'html',
    'xml',
    'xhtml',
    'conf',
    'sh',
    'py',
    'pl',
    'abw',
    'cpp',
    'h'
]
conn = sql.connect('index.db')
cursor = conn.cursor()

CLEAN_SLATE_QUERY = "DROP TABLE IF EXISTS fileindex"

GENESIS_QUERY = """
CREATE TABLE fileindex (
    path TEXT,
    name TEXT,
    extension TEXT,
    contents TEXT
)
"""

ADD_QUERY = u"""
INSERT INTO fileindex (path, name, extension, contents)
VALUES
    (?,?,?,?)
"""
class Crawler(object):

    def __init__(self):
        # Start from a clean slate: drop and recreate the table every time
        # we crawl.
        cursor.execute(CLEAN_SLATE_QUERY)
        cursor.execute(GENESIS_QUERY)
        conn.commit()

    def crawl(self, start_dir='.'):
        print "Now crawling, starting from %s..." % start_dir
        print "It may take a few minutes!"
        self.begin = time.time()
        # The crawling cycle:
        for dirpath, dirnames, filenames in os.walk(os.path.realpath(start_dir)):
            if not filenames:
                continue
            for filename in filenames:
                extension = filename.split('.')[-1]
                # Instead of printing to stdout, let's write it down:
                filepath = os.path.join(dirpath, filename)
                # Grab the contents if the file is plain text:
                if extension in indexable:
                    # We may have issues reading some files (permissions, etc.):
                    try:
                        with open(filepath, 'r') as plaintext:
                            content = plaintext.read()
                    except IOError:
                        content = None
                else:
                    content = "binary"
                try:
                    cursor.execute(ADD_QUERY,
                                   (filepath, filename, extension, content))
                # For some reason, even when you crawl as root, some parts of
                # the operating system fail to get indexed =P
                except sql.ProgrammingError:
                    pass
        conn.commit()
        self.elapsed = time.time() - self.begin
        print "The operation took %d seconds" % self.elapsed

    def close(self):
        conn.close()
if __name__ == "__main__":
    google = Crawler()
    if len(sys.argv) == 2:
        google.crawl(sys.argv[1])
    else:
        google.crawl()
    google.close()
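
# From the shell, the module can also be run directly, e.g.:
#
#     python crawl.py             # crawl the current directory
#     python crawl.py /some/tree  # crawl a specific tree (example path)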