anonymize-csv.scm 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #!/bin/sh
  2. # -*- scheme -*-
  3. exec guile -e main -s "$0" "$@"
  4. !#
  5. ;; Double-Anonymize the trust.csv by replacing keys with
  6. ;; index-numbers. This prevents results from evaluations of the trust
  7. ;; graph from being applied directly to correlation attacks on Freenet
  8. ;; users.
  9. ;; TODO: use vhashes instead of regular hash tables.
  10. (use-modules (ice-9 rdelim)
  11. (ice-9 i18n)
  12. (srfi srfi-69) ; hash tables
  13. (srfi srfi-1) ; first, second, third
  14. )
  ;; Insert every one of ELEMENTS into the set TABLE (a SRFI-69 hash
  ;; table whose keys are the members; the stored value is always #t).
  ;; Adding a member that is already present is a no-op.
  ;; Returns TABLE itself so calls can be chained.
  (define (set-add table . elements)
    (for-each (lambda (member)
                (hash-table-set! table member #t))
              elements)
    table)
  ;; Return the members of the set TABLE as a freshly built list.
  ;; The order of the members is whatever the hash table yields; use
  ;; a sort on the result when a stable order is needed.
  (define (set-keys table)
    (hash-table-fold table
                     (lambda (member present? collected)
                       (cons member collected))
                     '()))
  ;; Return the number of distinct members currently held in the set
  ;; TABLE.
  (define set-size
    (lambda (table)
      (hash-table-size table)))
  ;; Return the members of the set TABLE as a list in ascending
  ;; string<? order.  All members must therefore be strings.
  (define (set->list-sorted table)
    (let ((members (set-keys table)))
      (sort-list members string<?)))
  ;; Create a new, empty set.  A set is a SRFI-69 hash table whose keys
  ;; are the members.  Keys are compared with equal? (the SRFI-69
  ;; default, spelled out here), so two strings with the same content
  ;; count as the same member.
  (define (make-set)
    (make-hash-table equal?))
  ;; Read PORT line by line until end of file and collect every
  ;; identifier found in the first two `;'-separated columns (source
  ;; and target) of each line.
  ;; Returns the distinct identifiers as a list sorted with string<?.
  ;; The header line must already have been consumed by the caller.
  (define (get-ids port)
    (let ((seen (make-set)))
      (do ((row (read-line port) (read-line port)))
          ((eof-object? row)
           (set->list-sorted seen))
        (let ((fields (string-split row #\;)))
          (set-add seen (first fields) (second fields))))))
  ;; Number the identifiers in the list IDS consecutively, starting at
  ;; 0 and following list order, and return a one-argument procedure
  ;; that maps an identifier to its number.
  ;; The returned procedure signals an error (from hash-table-ref) when
  ;; asked for an identifier that was not in IDS.
  (define (index-ids-fun ids)
    (let ((id-to-index (make-hash-table)))
      (do ((remaining ids (cdr remaining))
           (position 0 (+ 1 position)))
          ((null? remaining))
        (hash-table-set! id-to-index (car remaining) position))
      (lambda (id)
        (hash-table-ref id-to-index id))))
  ;; Consume the first line of PORT and verify that it is a CSV header
  ;; starting with "source;target", compared case-insensitively via
  ;; string-locale-downcase.  Additional columns after the prefix are
  ;; accepted.
  ;; Signals an error with a descriptive message when the header does
  ;; not match, or when PORT is already at end of file (empty input).
  ;; On success PORT is positioned at the first data line; the return
  ;; value is unspecified.
  (define (check-csv-header port)
    (let ((header (read-line port))
          (required-header-lowercase "source;target"))
      (cond
       ;; read-line yields the EOF object for an empty file.  It is not
       ;; a string, so it must be caught before the downcase below.
       ((eof-object? header)
        (error (format #f "input file must have header '~A' (regardless of case) but is empty" required-header-lowercase)))
       ((not (string-prefix? required-header-lowercase (string-locale-downcase header)))
        (error (format #f "input file must have header '~A' (regardless of case) but has header '~A'" required-header-lowercase header))))))
  ;; Validate the CSV header on PORT, then read the remaining lines and
  ;; return a procedure mapping each identifier found in the source and
  ;; target columns to its index number.
  (define (index-ids-from-file port)
    (check-csv-header port)
    (index-ids-fun (get-ids port)))
  ;; Copy the CSV on INPORT to OUTPORT, replacing the identifiers in the
  ;; first two columns of every data line by the numbers that ID->INDEX
  ;; returns for them.  The third column (weight) is copied verbatim.
  ;; The header of INPORT is validated and consumed first, and a fixed
  ;; header line is written to OUTPORT.
  ;; Returns #t once the end of INPORT is reached.
  (define (anonymize-ids id->index inport outport)
    (check-csv-header inport)
    (format outport "Source;Target;Weight\n")
    (do ((row (read-line inport) (read-line inport)))
        ((eof-object? row) #t)
      (let* ((fields (string-split row #\;))
             (source-index (id->index (first fields)))
             (target-index (id->index (second fields)))
             (weight (third fields)))
        (format outport "~A;~A;~A\n" source-index target-index weight))))
  ;; Script entry point.  ARGS is the command line: the program name,
  ;; optionally followed by the input file and then the output file.
  ;; Missing names default to "trust-deduplicated.csv" and
  ;; "trust-anonymized.csv"; further arguments are ignored.
  ;;
  ;; The input file is read twice: a first pass builds the id->index
  ;; mapping, a second pass writes the anonymized copy.
  ;; Signals an error when both file names are identical.
  (define (main args)
    (let* ((file-args (cdr args))
           (infile (if (null? file-args)
                       "trust-deduplicated.csv"
                       (first file-args)))
           (outfile (if (or (null? file-args) (null? (cdr file-args)))
                        "trust-anonymized.csv"
                        (second file-args))))
      ;; Opening the output truncates it, which would wipe the input
      ;; when both names refer to the same file.  Only the names are
      ;; compared; different paths to one file are not detected.
      (when (string=? infile outfile)
        (error (format #f "input and output file must differ, but both are '~A'" infile)))
      ;; Build the index before the output file is opened, so that an
      ;; unreadable or malformed input does not create or truncate the
      ;; output file.  (The bindings of a plain `let' are evaluated in
      ;; unspecified order, which did not guarantee this.)
      (let ((id->index (call-with-input-file infile index-ids-from-file)))
        ;; call-with-*-file close their port when the procedure returns.
        (call-with-input-file infile
          (lambda (inport)
            (call-with-output-file outfile
              (lambda (outport)
                (anonymize-ids id->index inport outport))))))))