deduplicate-csv.scm 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. #!/bin/sh
  2. # -*- scheme -*-
  3. exec guile -e main -s "$0" "$@"
  4. !#
  5. ;; Remove duplicate entries from the csv file (these are due to
  6. ;; downloading multiple versions of the same ID).
  7. (use-modules (ice-9 rdelim) ; for read-line
  8. (ice-9 i18n)
  9. (srfi srfi-1) ; first, second, third
  10. )
  ;; Copy the semicolon-separated file INFILE to OUTFILE, keeping only the
  ;; first row seen for each (first column, second column) pair.
  ;;
  ;; - The first line of INFILE is treated as a header and copied verbatim.
  ;;   An empty INFILE produces an empty OUTFILE.
  ;; - Rows with fewer than two fields (e.g. blank lines) are keyed on the
  ;;   fields they do have, so they are deduplicated instead of raising.
  ;; - Both ports are closed when the copy finishes.
  ;;
  ;; Returns #t.
  (define (deduplicate infile outfile)
    ;; Build the lookup key for LINE.  The key is a list of fields, not their
    ;; concatenation: joining the strings would make ("ab" "c") and ("a" "bc")
    ;; indistinguishable.  Hash tables accessed through hash-ref / hash-set!
    ;; compare keys with equal?, so lists of strings work as keys.
    (define (row-key line)
      (let ((columns (string-split line #\;)))
        (if (null? (cdr columns))
            columns
            (list (car columns) (cadr columns)))))
    ;; The input file is opened first so that a missing INFILE fails before
    ;; OUTFILE is created or truncated.
    (call-with-input-file infile
      (lambda (inport)
        (call-with-output-file outfile
          (lambda (outport)
            (let ((known (make-hash-table))
                  (header (read-line inport)))
              ;; Only emit a header when the input actually has one.
              (unless (eof-object? header)
                (display header outport)
                (newline outport)
                (let copy-dedup ((line (read-line inport)))
                  (unless (eof-object? line)
                    (let ((key (row-key line)))
                      (unless (hash-ref known key)
                        (hash-set! known key #t)
                        (display line outport)
                        (newline outport)))
                    (copy-dedup (read-line inport)))))
              #t))))))
  ;; Script entry point.  ARGS is the full command line, program name first.
  ;; The first user argument names the input file (default "trust.csv") and
  ;; the second names the output file (default "trust-deduplicated.csv").
  (define (main args)
    (let* ((user-args (cdr args))   ; drop the program name
           (infile (cond ((null? user-args) "trust.csv")
                         (else (car user-args))))
           (outfile (cond ((null? user-args) "trust-deduplicated.csv")
                          ((null? (cdr user-args)) "trust-deduplicated.csv")
                          (else (cadr user-args)))))
      (deduplicate infile outfile)))