crawl-wot.scm 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. #!/bin/sh
  2. # -*- scheme -*-
  3. exec guile -e main -s "$0" "$@"
  4. !#
  5. ;; Simple WoT crawler
  6. (use-modules (web request)
  7. (web client)
  8. (web response)
  9. (web uri)
  10. (web http)
  11. (ice-9 threads)
  12. (ice-9 vlist)
  13. (ice-9 rdelim)
  14. (ice-9 futures)
  15. (rnrs io ports)
  16. (ice-9 match)
  17. (srfi srfi-42)
  18. (srfi srfi-1)
  19. (rnrs bytevectors)
  20. (sxml simple)
  21. (sxml match))
  22. (define base-url "http://127.0.0.1:8888")
  23. (define seed-id "USK@QeTBVWTwBldfI-lrF~xf0nqFVDdQoSUghT~PvhyJ1NE,OjEywGD063La2H-IihD7iYtZm3rC0BP6UTvvwyF5Zh4,AQACAAE/WebOfTrust/1502")
  24. (define (furl uri)
  25. (string-append base-url uri "?forcedownload=true"))
  26. (define (furl-uri uri)
  27. (string-append base-url "/" uri "?forcedownload=true"))
  28. (define (get url)
  29. (let* ((u (string->uri url))
  30. (r (build-request u))
  31. (p (open-socket-for-uri u))
  32. (rr (write-request r p))
  33. (rp (request-port rr)))
  34. (force-output p)
  35. (declare-opaque-header! "Location")
  36. ;(while (write (read-line p))
  37. ; (newline))
  38. (let ((resp (read-response rp)))
  39. (let ((c (response-code resp))
  40. (h (response-headers resp))
  41. (b (read-response-body resp)))
  42. (cond
  43. ((= c 301)
  44. (get (furl (assoc-ref h 'location))))
  45. ((= c 200)
  46. (cond
  47. ((equal? '(text/html (charset . "utf-8")) (assoc-ref h 'content-type))
  48. (utf8->string b))
  49. ((equal? '(application/force-download) (assoc-ref h 'content-type))
  50. (utf8->string b))
  51. (else (assoc-ref h 'content-type))))
  52. (else c))))))
  53. (define (non-breaking-sxml-reader xml-port)
  54. (catch #t
  55. (lambda () (xml->sxml xml-port))
  56. (lambda (key . args) (format #t "~A: ~A" key args)(newline) '())))
  57. (define (snarf-wot-ids xml-port)
  58. (let ((sxml (non-breaking-sxml-reader xml-port)))
  59. (let ((uris '()))
  60. (let grab-uris ((sxml sxml))
  61. (match sxml
  62. (('Identity uri) (set! uris (cons uri uris)))
  63. ((a b ...)
  64. (map grab-uris sxml))
  65. (else '())))
  66. uris)))
  67. (define (wot-uri-key uri)
  68. (let ((index (string-index uri #\/)))
  69. (if index
  70. (string-take uri index)
  71. uri))) ;; no / in uri, so it is already a key.
  72. (define (wot-uri-filename uri)
  73. (let ((u (if (string-prefix? "freenet:" uri)
  74. (substring uri 8)
  75. uri)))
  76. (string-join (string-split u #\/) "-")))
  77. (define (dump-wot-id uri filename)
  78. (let ((u (if (string-prefix? "freenet:" uri)
  79. (substring uri 8)
  80. uri)))
  81. (format #t "Download to file ~A\n" filename)
  82. (if (string-prefix? "USK@" u)
  83. (let ((data (get (furl-uri u))))
  84. (if (string? data)
  85. (let ((port (open-output-file filename)))
  86. (put-string port data)
  87. (close-port port))
  88. (error (format #t "tried to save in file ~A\n" filename))))
  89. (error (format #t "tried to save in file ~A\n" filename)))))
  90. (define (flatten l)
  91. "Flatten a nested list into a single list."
  92. (cond ((null? l) '())
  93. ((list? l) (append (flatten (car l)) (flatten (cdr l))))
  94. (else (list l))))
  95. (define* (crawl-wot seed-id #:key (redownload #f))
  96. ;; TODO: add (flatten ...) with Guile 2.1.x (currently it gives a stack overflow)
  97. (let ((known '()))
  98. (let crawl ((seed seed-id))
  99. ;; save the data
  100. (if (catch 'misc-error
  101. (lambda () (let* ((filename (wot-uri-filename seed))
  102. (dump (lambda () (dump-wot-id seed filename))))
  103. (if (and (not redownload) (file-exists? filename))
  104. (let* ((s (stat filename))
  105. (size (stat:size s)))
  106. (if (= size 0)
  107. (dump)
  108. (format #t "Use local copy of file ~A (redownload ~A).\n" filename redownload)))
  109. (dump))
  110. #f))
  111. (lambda (key . args) #t))
  112. known
  113. ;; snarf all uris
  114. (let ((uris (call-with-input-file (wot-uri-filename seed) snarf-wot-ids)))
  115. ;; (write seed)(newline)
  116. ;; (when (not (null? uris))
  117. ;; (write (car uris))(newline))
  118. (let ((new (list-ec (: u uris) (if (and
  119. (not (pair? u)) ; TODO: this is a hack. I do not know why u can be the full sxml. Seems to happen with IDs who do not have any trust set.
  120. (not (member (wot-uri-key u) known)))) u)))
  121. (when (not (null? new))
  122. (display "new: ")
  123. (write (car new))(newline))
  124. (when (not (null? known))
  125. (display "known: ")
  126. (write (car known))(newline)(write (length known))(newline))
  127. (set! known (lset-union equal?
  128. (list-ec (: u new) (wot-uri-key u))
  129. known))
  130. (if (null? new)
  131. known
  132. (lset-union equal? known (par-map crawl new)))))))))
  133. (define (parse-datehint str)
  134. (let ((lines (string-split str #\newline)))
  135. `((version . ,(list-ref lines 1))
  136. (date . ,(list-ref lines 2)))))
  137. (define* (datehint-for-key key year #:key (sitename "WebOfTrust") (week #f))
  138. (string-append "SSK" (substring key 3)
  139. "/" sitename
  140. "-" "DATEHINT"
  141. "-" (number->string year)
  142. (if week (string-append "-WEEK-" (number->string week)) "")))
  143. (define (furl-key-name-version key name version)
  144. "Get a freenet URL for the key and the version"
  145. (furl-uri (string-append "SSK" (substring key 3) "/" name "-" version)))
  146. (define (download-by-weekly-date-hint uri year week)
  147. (let* ((weekuri (datehint-for-key (wot-uri-key uri) year #:week week))
  148. (hint (get (furl-uri weekuri))))
  149. (if (not (string? hint))
  150. #f
  151. (let* ((hint-alist (parse-datehint hint))
  152. (version (assoc-ref hint-alist 'version))
  153. (date (assoc-ref hint-alist 'date))
  154. (url (furl-key-name-version (wot-uri-key uri) "WebOfTrust" version))
  155. (filename (string-append date "/" (wot-uri-key uri) "-" version)))
  156. (when (not (file-exists? date))
  157. (mkdir date))
  158. (format #t "download to: ~A | for week ~A\n" filename week)
  159. (let ((data (get url)))
  160. (when (string? data)
  161. (let ((port (open-output-file filename)))
  162. (put-string port data)
  163. (close-port port))))
  164. filename))))
  165. (define (download-by-date-hint uri)
  166. "Download all versions of the ID, ordered by the week in the DATEHINT."
  167. ;; An uri looks like this: USK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE
  168. ;; A date hint for WoT looks like this: SSK@QWW2a74OWrtN-aWJ80fjWhfFx8NlNrlU0dQfd3J7t1E,2g-wfM57Up9DV1qoEDMPcDU-KPskk0yyiYFz67ydSos,AQACAAE-WebOfTrust-DATEHINT-2015
  169. ;; or
  170. ;; SSK@[key]/[sitename]-DATEHINT-[year]
  171. ;; SSK@[key]/[sitename]-DATEHINT-[year]-WEEK-[week]
  172. ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]
  173. ;; SSK@[key]/[sitename]-DATEHINT-[year]-[month]-[day]
  174. ;; see http://draketo.de/light/english/freenet/usk-and-date-hints
  175. ;; Approach: First check whether the ID has a date hint for each year. Then check each weak in the matching years.
  176. ;; download the versions into directories ordered as YEAR-month-day/SSK@...-WebOfTrust-version
  177. (let ((years (iota 10 2016 -1)) ; last 10 years
  178. (weeks (iota 52 1))) ; 1-52
  179. (delete #f ;; only return the filenames of successful downloads
  180. (par-map (lambda (year)
  181. (let* ((yearuri (datehint-for-key (wot-uri-key uri) year))
  182. (hint (get (furl-uri yearuri))))
  183. (if (not (string? hint))
  184. #f
  185. (let* ((hint-alist (parse-datehint hint))
  186. (date (assoc-ref hint-alist 'date))
  187. (month (string->number (list-ref (string-split date #\-) 1)))
  188. (min-week (* month 4))) ; avoid trying to download weeks which cannot be available.
  189. (format #t "Downloading key ~a starting in month ~a for weeks ~a to 52\n" yearuri month min-week)
  190. (delete #f ;; only return the filenames of successful downloads
  191. (n-par-map 10 (lambda (week)
  192. (if (< week min-week) ; avoid weeks earlier than the date in the yearly date hint
  193. #f
  194. (download-by-weekly-date-hint uri year week)))
  195. weeks))))))
  196. years))))
  197. (define (main args)
  198. (let ((seed-id (if (null? (cdr args))
  199. seed-id
  200. (car (cdr args)))))
  201. (let ((seed (if (string-index seed-id #\/)
  202. seed-id
  203. (string-append "USK" (string-drop seed-id 3) "/WebOfTrust/-1")))) ; -1 can also return 0
  204. ;; (crawl-wot seed))))
  205. ;; (write (download-by-date-hint seed)))))
  206. (par-map download-by-date-hint
  207. (crawl-wot seed)))))