crawl.py

#!/usr/bin/env python
#
# "THE CRAWLER"
#
# In this three-part search engine, the crawler is responsible for retrieving
# information from the filesystem and assembling it into a form that can be
# queried later on.
#
# Note that we overwrite the index file every time we crawl. For this release
# there is no "delta checking" on the filesystem (at least not yet).
#
# VERSIONING:
# - Mechanism that allows the filesystem tree information to be stored in a
#   database for querying afterwards. Most, if not all, of the filesystem can
#   be crawled now.
# - The crawler is now a Python module, allowing it to be imported from
#   another Python program (see the usage sketch below).
#
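# A minimal usage sketch, assuming this file is importable as `crawl` on
# sys.path (the other parts of the search engine may wire it up differently):
#
#     from crawl import Crawler
#
#     crawler = Crawler()           # drops and recreates index.db
#     crawler.crawl('/home/user')   # walk the tree, fill the fileindex table
#     crawler.close()               # release the sqlite connection
#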
import os
import sys
import sqlite3 as sql
import time

# All the "index-searchable" filetypes:
indexable = [
    'txt',
    'csv',
    'html',
    'xml',
    'xhtml',
    'conf',
    'sh',
    'py',
    'pl',
    'abw',
    'cpp',
    'h'
]
conn = sql.connect('index.db')
cursor = conn.cursor()

CLEAN_SLATE_QUERY = "DROP TABLE IF EXISTS fileindex"

GENESIS_QUERY = """
CREATE TABLE fileindex (
    path TEXT,
    name TEXT,
    extension TEXT,
    contents TEXT
)
"""

ADD_QUERY = u"""
INSERT INTO fileindex (path, name, extension, contents)
VALUES
    (?,?,?,?)
"""
class Crawler(object):

    def __init__(self):
        # Start from a clean slate: drop and recreate the table every time
        # we crawl.
        cursor.execute(CLEAN_SLATE_QUERY)
        cursor.execute(GENESIS_QUERY)
        conn.commit()

    def crawl(self, start_dir='.'):
        print "Now crawling, starting from %s..." % start_dir
        print "It may take a few minutes!"
        self.begin = time.time()
        # The crawling cycle:
        for dirpath, dirnames, filenames in os.walk(os.path.realpath(start_dir)):
            if not filenames:
                continue
            for filename in filenames:
                extension = filename.split('.')[-1]
                # Instead of printing to stdout, let's write it down:
                filepath = os.path.join(dirpath, filename)
                # Grab the contents if the file is plain text:
                if extension in indexable:
                    # We may have issues reading some files (permissions, etc.):
                    try:
                        with open(filepath, 'r') as plaintext:
                            content = plaintext.read()
                    except IOError:
                        content = None
                else:
                    content = "binary"
                try:
                    cursor.execute(ADD_QUERY,
                                   (filepath, filename, extension, content))
                # For some reason, even when you crawl as root, some parts of
                # the operating system fail to get indexed =P
                except sql.ProgrammingError:
                    pass
        conn.commit()
        self.elapsed = time.time() - self.begin
        print "The operation took %d seconds" % self.elapsed

    def close(self):
        conn.close()
if __name__ == "__main__":
    google = Crawler()
    if len(sys.argv) == 2:
        google.crawl(sys.argv[1])
    else:
        google.crawl()
    google.close()
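
# From the shell, the module can also be run directly, e.g.:
#
#     python crawl.py             # crawl the current directory
#     python crawl.py /some/tree  # crawl a specific tree (example path)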