wikipedia-crawler.scm 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. ;; (use-modules (ice-9 receive)
  2. ;; (web client)
  3. ;; (web request)
  4. ;; (web http)
  5. ;; (web uri))
  6. ;; (http-get "https://en.wikipedia.org/w/api.php?action=query&titles=Main%20Page&prop=revisions&rvprop=content&format=xml")
  7. ;; (http-get (parse-header 'content-location "https://en.wikipedia.org/w/api.php?action=query&titles=Linux&prop=revisions&rvprop=content&format=xml"))
  8. ;; (parameterize ((current-http-proxy #f)) (http-get "https://en.wikipedia.org/w/api.php?action=query&titles=Linux&prop=revisions&rvprop=content&format=xml"))
  9. ;; (define a (open-output-file "uno.txt"))
  10. ;; scheme@(guile-user)> (display "?" a)
  11. ;; scheme@(guile-user)> (newline a)
  12. ;; scheme@(guile-user)> (close-output-port a)
  13. ;; scheme@(guile-user)> (newline a)
  14. ;; scheme@(guile-user)> (close-output-port a)
  15. ;; scheme@(guile-user)> (define a (open-output-file "uno.txt"))
  16. ;; scheme@(guile-user)> (display (parameterize ((current-http-proxy #f)) (http-get "https://en.wikipedia.org/w/api.php?action=query&titles=Linux&prop=revisions&rvprop=content&format=xml" #:keep-alive? #t)) a)
  17. ;https://en.wikipedia.org/w/api.php?format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=Stack%20Overflow
  18. (use-modules (ice-9 receive)
  19. (web client)
  20. (sxml simple))
  21. ;; ;;; Using receive.
  22. ;; (receive (head body)
  23. ;; (http-get "http://gnu.org/")
  24. ;; (begin
  25. ;; ;; Display HTTP Response
  26. ;; (newline)
  27. ;; (display "RESPONSE:")
  28. ;; (newline)
  29. ;; (display head)
  30. ;; ;; Display HTTP Response body
  31. ;; (newline)
  32. ;; (display "BODY:")
  33. ;; (newline)
  34. ;; (display body)
  35. ;; (newline)))
  36. ;; ;texto casi completo
  37. ;; (define b (car (cdr (cdr (xml->sxml a)))))
  38. ;; ;titulo del documento!!
  39. ;; (define c (car (cdr (cdr (cdr b)))))
  40. ;; ;not yet
  41. ;; (define d (caddr c))
  42. ;; ;ya casi!
  43. ;; (define g (cadr d))
  44. ;; ;;; mejor aun!
  45. ;; ;https://en.wikipedia.org/w/api.php?format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=Stack%20Overflow
  46. ;; (substring (texto (xml->sxml nautilus)) (string-contains (texto (xml->sxml nautilus)) "'''''Nautilus'''''"))
  47. ;; (define titulo (cadr (car (cdr (car (cdr g))))))
  48. ;; (define m (car (cdr (cdr g))))
  49. (use-modules (ice-9 receive)
  50. (web client)
  51. (sxml simple))
  52. (define (last lis)
  53. (cond [(and (not (pair? (car lis)))
  54. (null? (cdr lis))) (car lis)]
  55. [(not (null? (cdr lis))) (last (cdr lis))]
  56. [else (last (car lis))]))
  57. (define texto last)
  58. ;;; recive un texto "Stack%20Overflow"
  59. ;; y extrae las primeras 0-200 letras con substring
  60. ; al llamar http-get en (query text)
  61. ;;; lo convierte de xml a sxml
  62. ;;; y me da texto, es decir el (last list)
  63. ;;; wikipedia xml te da el ultimo como xml
  64. (define (search text)
  65. (substring
  66. (texto (xml->sxml
  67. (receive (head body)
  68. (http-get (query text))
  69. body)))
  70. 0
  71. 300))
  72. ;; como llamar a wikipedia
  73. ;;;; agregar para que convierta espacios a %20
  74. (define (query text)
  75. (format #f "https://en.wikipedia.org/w/api.php?format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=~a" text))
  76. (define (wikipedia text)
  77. (string-append (search text)
  78. "... More at: "
  79. (link-w text)))
  80. ;;;; agregar para que convierta espacios a guion bajo _
  81. (define (link-w text)
  82. (format #f "https://en.wikipedia.org/wiki/~a" text))