123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113 |
- #!/cray/css/users/bavier/bin/guile -s
- !#
- ;;
- ;; Procedures for manipulating data stored in a
- ;; 2D-like structure, where a row represents a
- ;; "record", and records may have a number of
- ;; "attributes" or columns
- ;;
- (define-module (data-mining dataset)
- #:use-module (data-mining type-conversions)
- #:use-module (data-mining util)
- #:use-module (data-mining hash-util)
- #:use-module (data-mining attributes)
- #:use-module (data-mining indexed-matrix)
- #:use-module (srfi srfi-1) ;;autoload (srfi srfi-1) (first second third last every)
- #:use-module (srfi srfi-9)
- #:use-module (srfi srfi-19) ;;(date->string time-utc->date)
- #:use-module (srfi srfi-26) ;;(cut cute)
- #:use-module (ice-9 receive) ;;(receive)
- #:use-module (ice-9 rdelim) ;;(read-line write-line)
- #:use-module (ice-9 regex) ;;(string-match make-regexp regexp-exec)
- #:export (make-dataset
- make-dataset/shared
- dataset-length
- dataset-width
- dataset-empty?
- ;; arff->dataset
- delimited->dataset
- dataset->delimited
- dataset-set!
- dataset-ref
- dataset-filter
- dataset-partition-records
- dataset-attributes
- dataset-attribute
- dataset-attribute-ref
- dataset-attribute-set!
- dataset-attribute-indices
- dataset-attribute-values
- dataset-label-idx
- dataset-label-attribute-values
- dataset-record-indices
- dataset-derive-attribute!
- dataset-entry-value-alist))
- (define author "Eric Bavier <bavier@member.fsf.org>")
- (define date "2014 June 3")
- (define copyright "GPLv3+")
- ;;
- ;; This is free software released under the GPLv3, or later
- ;;
- ;; =================================================
- ;; Dataset Object
- ;; =================================================
- (define-record-type dataset
- (make-dataset*
- entries ;2D matrix containing dataset
- ;values
- attribute-table ;A collection of the attributes
- ;describing each column of the
- ;dataset
- label-idx ;The name of the label attribute
- )
- dataset?
- (entries dataset-entries)
- (attribute-table dataset-attribute-table)
- (label-idx dataset-label-idx set-dataset-label-idx!))
- (define (attribute-list->attributes lst)
- (fold
- ;; Allow a mixing of simple symbol declarations
- ;; for attributes and concrete attribute types.
- (lambda (e hash)
- (cond
- ((symbol? e)
- (let ((attr (symbol->attribute e)))
- (hash-set!
- hash (attribute-name attr) attr)))
- ((attribute? e)
- (hash-set! hash (attribute-name e) e)))
- hash)
- (make-hash-table)
- lst))
- (define* (make-dataset attributes
- label-idx
- #:optional
- (entries (make-indexed-matrix)))
- (let ((attributes* (cond
- ((hash-table? attributes) attributes)
- (else (attribute-list->attributes attributes)))))
- (make-dataset* entries attributes* label-idx)))
- (define* (make-dataset/shared dataset
- #:key
- (rows (dataset-record-indices dataset))
- (columns (dataset-attribute-indices dataset)))
- ;; Make sure the dataset label attribute is included in the columns
- (let* ((columns* (lset-adjoin equal?
- columns
- (dataset-label-idx dataset)))
- (attribute-table* (hash-subset (dataset-attribute-table dataset)
- columns*)))
- (make-dataset
- attribute-table*
- (dataset-label-idx dataset)
- (make-indexed-matrix/shared (dataset-entries dataset)
- #:row-indices rows
- #:column-indices columns*))))
- ;;; Set the value of an entry in DS. We assume that ENTRY has already
- ;;; been vetted by the attribute it is being set for (i.e. no scrubbing
- ;;; takes place).
- (define (dataset-set! ds entry rec-idx attr-idx)
- (indexed-matrix-set!
- (dataset-entries ds)
- entry rec-idx attr-idx))
- (define (dataset-ref ds rec-idx attr-idx)
- (indexed-matrix-ref (dataset-entries ds) rec-idx attr-idx))
- ;;; Filter the entries of DS simultaneously on the contents of each record
- ;;; (row) as well as the contents of each attribute (column).
- ;;;
- ;;; The procedure RECORD-PRED is applied as (record-pred row-idx entry-alist)
- ;;; where row-idx is the unique identifier for the current row, and
- ;;; entry-alist is an association list where the keys are the attribute
- ;;; names and the values are the values each attribute assumes for the current
- ;;; record. Analogously for ATTRIBUTE-PRED. RECORD-PRED and ATTRIBUTE-PRED
- ;;; may be #t, in which case all rows or columns, respectively, are returned
- ;;; in the resulting dataset.
- ;;;
- ;;; Note: It is not possible to filter out the label attribute
- ;;;
- (define* (dataset-filter ds
- #:key
- (record-pred #t)
- (attribute-pred #t))
- (let ((recs
- (if (eq? record-pred #t)
- (dataset-record-indices ds)
- (filter identity ;filter out those that return #f
- (indexed-matrix-map-indexed-rows
- (lambda (i e) (and (record-pred i e) i))
- (dataset-entries ds)))))
- (attrs
- (if (eq? attribute-pred #t)
- (dataset-attribute-indices ds)
- (filter identity ;filter out those that return #f
- (indexed-matrix-map-indexed-columns
- (lambda (j e) (and (attribute-pred j e) j))
- (dataset-entries ds))))))
- (make-dataset/shared ds #:rows recs #:columns attrs)))
- ;;; Returns a list of datasets of length (1+ (length
- ;;; preds)). The records in d1 are those records
- ;;; which satisfy p1, the records in d2 are those
- ;;; which satisfay p2 but not p1, etc. The last
- ;;; dataset in the result contains those records
- ;;; which do not satisfy any of the predicates in
- ;;; PREDS.
- ;;;
- ;;; Each predicate in PREDS is applied as in
- ;;; dataset-filter
- (define (dataset-partition-records preds ds)
- (if (null? preds) (list ds)
- (reverse!
- (map
- (lambda (alist-ds)
- (make-dataset/shared ds #:rows (map car alist-ds)))
- (fold (lambda (p l)
- (let* ((part (fold (lambda (e acc)
- (if (p (car e) (cdr e))
- (cons (cons e (car acc))
- (cdr acc))
- (cons (car acc)
- (cons e (cdr acc)))))
- '(() . ())
- (car l)))
- (in (reverse! (car part)))
- (out (reverse! (cdr part))))
- (cons out (cons in (cdr l)))))
- (list (indexed-matrix-map-indexed-rows
- cons (dataset-entries ds)))
- preds)))))
- ;; Lookup an attribute ATTR in DS by name or index. Return #f if none found
- ;; by the given index.
- (define (dataset-attribute-ref ds attr-idx)
- (hash-ref (dataset-attribute-table ds) attr-idx))
- (define (dataset-attribute-set! ds attr-idx attr)
- (hash-set! (dataset-attribute-table ds) attr-idx attr))
- (define dataset-attribute
- (make-procedure-with-setter dataset-attribute-ref
- dataset-attribute-set!))
- ;;; Return a reference to the label attribute
- ;; (define (dataset-label-attribute-ref ds)
- ;; (dataset-attribute-ref ds (dataset-label-idx ds)))
- ;; (define (dataset-label-attribute-set! ds attr)
- ;; (let ((label-idx (dataset-label-idx ds))
- ;; (label-idx* (attribute-name attr)))
- ;; (hash-remove! (dataset-attribute-table ds) label-idx)
- ;; (hash-set! (dataset-attribute-table ds) lavel-idx* attr)
- ;; ;; Swap row references to labels from the old tag to the new
- ;; (indexed-matrix-reindex-column! (entries ds) oldtag newtag)
- ;; (set-dataset-label-idx! ds label-idx*)))
- ;; (define dataset-label-attribute
- ;; (make-procedure-with-setter dataset-label-attribute-ref
- ;; dataset-label-attribute-set!))
- ;;; Return a list of the concrete attribute validators of DS.
- (define (dataset-attributes ds)
- (hash-map->list (lambda (_ b) b) (dataset-attribute-table ds))
- (map (cute hash-ref (dataset-attribute-table ds) <>)
- (dataset-attribute-indices ds)))
- (define* (dataset-attribute-indices ds
- #:key
- (with-label #t))
- (let ((indices (hash-map->list (lambda (i _) i)
- (dataset-attribute-table ds))))
- (if with-label
- indices
- (remove (cute equal? <> (dataset-label-idx ds))
- indices))))
- ;;; Return a list of values belonging to the named attribute
- (define (dataset-attribute-values ds attr-idx)
- (indexed-matrix-column-entries (dataset-entries ds) attr-idx))
- ;;; Return a list of the label values
- (define (dataset-label-attribute-values ds)
- (dataset-attribute-values ds (dataset-label-idx ds)))
- ;;; Return a list of the record tag/index values
- (define (dataset-record-indices ds)
- (indexed-matrix-row-tags (dataset-entries ds)))
- ;;; Return the number of records this dataset has.
- (define (dataset-length ds)
- (indexed-matrix-length (dataset-entries ds)))
- ;;; Return the number of attributes this dataset has
- ;;; (does not including the label attribute)
- (define (dataset-width ds)
- (1- (hash-table-size (dataset-attribute-table ds))))
- (define (dataset-empty? ds)
- (or (= (dataset-length ds) 0)
- (= (dataset-width ds) 0)))
- ;; Add a new attribute ATTR to DS according to the
- ;; expression in PROC-EXP, which may contain
- ;; attribute tag names from DS. PROC-EXP will be
- ;; applied record-wise over DS, and the value for
- ;; ATTR at that record will be the result of
- ;; evaluating PROC-EXP with tag names replaced by
- ;; the attribute value of that record.'
- ;;
- ;; E.g. ::
- ;;
- ;; (dataset-derive-attribute!
- ;; d
- ;; (make-numeric-attribute #:name 'foo)
- ;; '(/ bar baz))
- ;;
- (define (dataset-derive-attribute! ds attr proc-exp)
- (let ((name (attribute-name attr))
- (ents (dataset-entries ds)))
- (begin
- (dataset-attribute-set! ds name attr)
- (for-each
- (lambda (i)
- (indexed-matrix-set!
- ents
- (eval (substitute-map proc-exp (dataset-entry-value-alist ds i))
- (interaction-environment))
- i name))
- (indexed-matrix-row-tags ents))
- (set-attribute-domain! attr (dataset-attribute-values ds name)))))
- (define (dataset-entry-value-alist ds rec-idx)
- (indexed-matrix-indexed-row (dataset-entries ds) rec-idx))
- ;; =================================================
- ;; Reading a dataset from an ARFF file
- ;; =================================================
- ;; Produces a new attribute according to the ARFF
- ;; @attribute tag-line
- (define (arff->attribute port)
- (define (attr-from-def name def)
- (cond ((equal? def "string") (make-string-attribute #:name name))
- ((or (equal? def "numeric")
- (equal? def "real")) (make-numeric-attribute #:name name))
- ((equal? def "integer") (make-integer-attribute #:name name))
- ((char=? #\{ (string-ref def 0))
- (let ((domain (string-split
- (substring def 1 (1- (string-length def)))
- #\,)))
- (make-nominal-attribute #:name name #:domain domain)))
- ((char=? #\[ (string-ref def 0))
- (let ((domain (string-split
- (substring def 1 (1- (string-length def)))
- #\,)))
- (make-ordinal-attribute #:name name #:domain domain)))))
- (define (bad-input in)
- (error (format #f "Cannot construct attribute from input: ~s\n" in)))
- (let* ((line (read-line port 'trim))
- (pieces (remove string-null? (string-split line #\ ))))
- (if (= (length pieces) 3)
- (let ((arff-tag (first pieces))
- (attr-name (string->symbol (second pieces)))
- (attr-def (third pieces)))
- (if (string-ci=? arff-tag "@attribute")
- (attr-from-def attr-name attr-def)
- (bad-input line))))))
- ;; Reads an ARFF-formatted data stream from PORT and
- ;; returns a new dataset with the contained data
- ;; (define* (arff->dataset #:optional (port (current-input-port)) . rest)
- ;; (let ((ds (make <dataset>)))
- ;; (begin
- ;; ((cut set-from-arff! ds port <...>) rest)
- ;; ds)))
- ;;; IGNORE-ATTRIBUTES and SELECT-ATTRIBUTES should be lists of symbols, which
- ;;; specify the attributes to ignore or select, respectively.
- ;;; SELECT-ATTRIBUTES may also be #t, in which case all attributes are
- ;;; selected.
- ;; (define* (arff->dataset #:optional
- ;; (port (current-input-port))
- ;; #:key
- ;; (ignore-attributes '())
- ;; (select-attributes #t)
- ;; (tag-index first)
- ;; (label-index last))
- ;; (let ((ds (make <dataset>)))
- ;; (begin
- ;; (skip-comments-and-whitespace port)
- ;; ;; Read relation tag and discard it
- ;; (let ((line (read-line port)))
- ;; (unless (string-match "@relation.*" line)
- ;; (error (format #f "Expecting '@relation' tag but got ~s" line))))
- ;; (skip-comments-and-whitespace port)
- ;; ;; Now we expect a block of '@attribute' statements
- ;; (let ((start-of-data? (let ((r (make-regexp "^@data.*$" regexp/icase)))
- ;; (cut regexp-exec r <>)))
- ;; (attrs '()))
- ;; (begin
- ;; ;; First, read all the attribute definitions
- ;; (do ((line (read-line port)
- ;; (begin (skip-comments-and-whitespace port)(read-line port))))
- ;; ;; Stop once the '@data' tag has been read
- ;; ((start-of-data? line))
- ;; (let ((a (arff->attribute (open-input-string line))))
- ;; (if (or (eq? select-attributes #t)
- ;; (member (tag a) select))
- ;; (begin
- ;; (set! attrs (append! attrs (list a)))))))
- ;; ;; Then, pick out those we're interested in
- ;; (let* ((all-tags (map tag attrs))
- ;; (select (cond
- ;; ((eq? select-attributes #t) all-tags)
- ;; ((eq? select-attributes #f) '())
- ;; (else select-attributes)))
- ;; (ignore (cond
- ;; ((eq? ignore-attributes #f) '())
- ;; (else ignore-attributes)))
- ;; (selected-tags
- ;; (map (lambda (s)
- ;; (cond ((integer? s) (list-ref all-tags s))
- ;; (else
- ;; (let ((ss (as-symbol s)))
- ;; (if (member ss all-tags)
- ;; ss
- ;; (error
- ;; (format #f
- ;; "Selected tag ~s not an attribute"
- ;; s)))))))
- ;; (lset-difference equal? select ignore)))
- ;; (selected-indices
- ;; (map (let ((tag-map (list->index-map all-tags)))
- ;; (cut assq-ref tag-map <>))
- ;; selected-tags))
- ;; (ti (cond
- ;; ((procedure? tag-index) (tag-index selected-indices))
- ;; ((symbol? tag-index)
- ;; (list-index (cut eq? tag-index <>)
- ;; all-tags))
- ;; (else tag-index)))
- ;; (li (cond
- ;; ((procedure? label-index) (label-index selected-indices))
- ;; ((symbol? label-index)
- ;; (list-index (cut eq? label-index <>)
- ;; all-tags))
- ;; (else label-index)))
- ;; (label-attr (list-ref attrs li))
- ;; (tag-label-list (if tag-index (list li ti) (list li)))
- ;; (mask (lset-union! = selected-indices tag-label-list))
- ;; (entry-attributes (take-indices
- ;; attrs
- ;; (lset-difference = mask (if tag-index
- ;; (list ti) '())))))
- ;; (slot-set! (entries ds) 'col-tags
- ;; (map tag entry-attributes))
- ;; (for-each (lambda (a) (dataset-attribute-set! ds (tag a) a))
- ;; entry-attributes)
- ;; (set! (dataset-label-attribute ds) label-attr)
- ;; ;; Now all the attributes are loaded in ATTRS,
- ;; ;; create a dataset and set its values from
- ;; ;; the block after '@data'
- ;; (set-delimited! ds mask port #:delimiter #\,
- ;; #:tag-index ti #:label-index li))))
- ;; ds)))
- ;;; Shorthand for reading lines from delimited input that may contain
- ;;; whitespace lines and comments
- (define* (next-line port #:optional (handle-delim 'trim))
- (begin
- (skip-comments-and-whitespace port)
- (read-line port handle-delim)))
- ;;; Read a dataset from delimited text.
- ;;;
- ;;; ATTRIBUTES should be a list of attributes that describe the columns
- ;;; of the input data. The length of ATTRIBUTES should be the same as
- ;;; the number of columns in the input data. If any element of
- ;;; ATTRIBUTES is #f, then that column in the input will be ignored.
- ;;;
- ;;; If HEADER is #t, then assume there is a header line and read
- ;;; attribute indices from that. Indices read in such a way will
- ;;; override any names/indices that the attributes in ATTRIBUTES already
- ;;; had. If HEADER is #f then we assume that attributes already have
- ;;; names set.
- (define* (delimited->dataset attributes
- label-idx
- #:optional
- (port (current-input-port))
- #:key
- (delimiter #\,)
- (header #t)
- (rec-idx #f))
- (let* ((attribute-columns (list-indices attribute? attributes))
- (attributes* (filter attribute? attributes))
- (attribute-indices
- (if header
- ;; Read attribute indices from header line
- (let* ((line (next-line port))
- (pieces (map (cute list-ref
- (string-split line delimiter)
- <>)
- attribute-columns))
- (indices (map string->symbol pieces)))
- (map (lambda (attr idx)
- (set-attribute-name! attr idx))
- attributes*
- indices)
- indices)
- (map attribute-name attributes)))
- ;; We need an input attribute for the record index column, but
- ;; that attribute should not be added to the dataset.
- (dataset-attrs (if rec-idx
- (remove (lambda (a)
- (equal? (attribute-name a) rec-idx))
- attributes*)
- attributes*))
- (attribute-map (map cons attribute-indices attribute-columns))
- (dataset (make-dataset dataset-attrs label-idx)))
- (set-delimited! dataset attribute-map port
- #:delimiter delimiter
- #:rec-idx rec-idx)
- dataset))
- (define* (dataset->delimited dataset
- #:optional
- (port (current-output-port)))
- (let ((attr-indices (dataset-attribute-indices dataset)))
- (begin
- (format port "rec,~{~a~^,~}\n"
- attr-indices)
- (for-each
- (lambda (rec-idx)
- (let ((values (dataset-entry-value-alist dataset rec-idx)))
- (format port "~a,~{~a~^,~}\n"
- (symbol->string rec-idx)
- (map (lambda (ai)
- (assoc-ref values ai))
- attr-indices))))
- (dataset-record-indices dataset)))))
- ;;; ATTRIBUTE-MAP must be a list of pairs (attr-idx . attr-col) where
- ;;; ATTR-IDX is the index of an attribute in DATASET (if not part of
- ;;; DATASET it will be ignored) and ATTR-COL is the 0-based index at
- ;;; which values for that attribute reside in the delimited input.
- ;;; REC-IDX, if given, should name one of the attributes in
- ;;; ATTRIBUTE-MAP that is to be used to assign indices to records. If
- ;;; not given each record will be assigned a "random" index.
- (define* (set-delimited! dataset
- ;; Maps from attribute index to the column in
- ;; the delimited input where that attribute's
- ;; values are found. Should include the label
- ;; attribute.
- attribute-map ;((attr-idx . attr-col) ...)
- #:optional
- (port (current-input-port))
- #:key
- (delimiter #\,)
- (rec-idx #f))
- (let record-loop ((count 0))
- (let ((line (next-line port)))
- (if (not (eof-object? line))
- (let* ((str-values (map string-trim (string-split line delimiter)))
- ;; Transform those string values into attribute values,
- ;; and construct input suitable for
- ;; set-dataset-entry-values!
- (values (filter-map/key+value
- (lambda (attr-idx attr-col)
- (and=> (dataset-attribute dataset attr-idx)
- (lambda (attr)
- (cons attr-idx
- (attribute-make-value
- attr
- (list-ref str-values attr-col))))))
- attribute-map))
- (rec-name (string->symbol
- (if rec-idx
- (list-ref str-values
- (assoc-ref attribute-map rec-idx))
- (string-append "rec" (number->string count))))))
- (set-dataset-entry-values! dataset rec-name values)
- (record-loop (1+ count)))))))
- ;;; For the record with index REC-IDX, set the values in VALUES, which
- ;;; must be an alist whose keys are attribute names/indices and whose
- ;;; values are the associated entry values for that attribute. An entry
- ;;; with index REC-IDX may or may not already exist in DATASET. VALUES
- ;;; must not necessarily contain a value for each attribute in DATASET,
- ;;; though if there are attribute indices in VALUES that are not part of
- ;;; DATASET then they will be ignored.
- (define (set-dataset-entry-values! dataset rec-idx values)
- (let ((attributes (dataset-attribute-table dataset)))
- (for-each/key+value
- (lambda (attr-idx value)
- (if (hash-ref attributes attr-idx)
- (dataset-set! dataset value rec-idx attr-idx)))
- values)))
- (define (skip-comments-and-whitespace port)
- (cond
- ((eof-object? (peek-char port)) (noop)) ;Nothing to be done
- ((char=? (peek-char port) #\%)
- (begin
- ;; Discard the comment line and continue
- (read-line port)
- (skip-comments-and-whitespace port)))
- ;; else check for whitespace-only lines
- (else
- (let ((line (read-line port 'concat)))
- (if (string-every char-set:whitespace line)
- ;; Discard this line and continue
- (skip-comments-and-whitespace port)
- ;; Else put the line back in port
- (unread-string line port))))))
- ;; Writing a dataset to an ARFF file
- ;; =================================
- ;; Write an attribute tag line for the given
- ;; attribute to PORT
- (define (attribute->arff attr port)
- (display (string-append
- (format #f "@attribute ~a " (attribute-name attr))
- (let ((domain (attribute-domain attr)))
- (cond
- ;; Guess the arff attribute type from
- ;; the characteristics of the domain
- ;; values.
- ;;
- ;; TODO: So far this only works if
- ;; domain is a list.
- ((every string=? domain) "string")
- ((every symbol? domain)
- (format #f "{~{~a~^,~}}" domain))
- (else "string")))
- "\n")
- port))
- ;; (define-method (attribute->arff (attr <attribute>) port)
- ;; (format port "@attribute ~a string\n" (tag attr)))
- ;; (define-method (attribute->arff (attr <string-attr>) port)
- ;; (format port "@attribute ~a string\n" (tag attr)))
- ;; (define-method (attribute->arff (attr <numeric-attr>) port)
- ;; (format port "@attribute ~a numeric\n" (tag attr)))
- ;; (define-method (attribute->arff (attr <nominal-attr>) port)
- ;; (format port "@attribute ~a {~a}\n" (tag attr)
- ;; (string-join (map as-string
- ;; (or (domain attr) '()))
- ;; ",")))
- ;; (define-method (attribute->arff (attr <ordinal-attr>) port)
- ;; (format port "@attribute ~a [~a]\n" (tag attr)
- ;; (string-join (map as-string
- ;; (or (domain attr) '()))
- ;; ",")))
- ;; (define* (dataset->arff ds #:optional (port (current-output-port)))
- ;; (begin
- ;; (when (slot-ref ds 'set-attr-domains)
- ;; (dataset-set-domains! ds))
- ;; (format port "% Dataset output by dataset.scm ~a\n"
- ;; (date->string (time-utc->date (current-time))))
- ;; (format port "@relation ~a\n\n"
- ;; (let ((fn (and port (port-filename port))))
- ;; (if fn
- ;; ;; Pull out just the basename without extension
- ;; (match:substring
- ;; (string-match "([^ /]+).arff" fn)
- ;; 1)
- ;; "foo")))
- ;; (format port "@attribute tag string\n")
- ;; (map (cut attribute->arff <> port)
- ;; (dataset-attributes ds #:with-label #t))
- ;; (format port "\n@data\n")
- ;; (indexed-matrix-for-each-row
- ;; (lambda (i elst)
- ;; (format port "~a,~{~a~^,~}\n"
- ;; (as-string i) (map as-string elst)))
- ;; (entries ds))))
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;;; Tests
- (use-modules (srfi srfi-64)
- (srfi srfi-1)
- (ice-9 format)
- (srfi srfi-43) ;vector library
- (data-mining test-util))
- (test-begin "dataset-test")
- ;;; Check helper routine attribute-list->attributes
- (define attr-hash (attribute-list->attributes
- `(integer
- nominal
- ,(make-string-attribute))))
- (hash-for-each
- (lambda (key value)
- (test-assert (attribute? value)))
- attr-hash)
- ;;; Check creating a dataset with symbols for attributes
- (define d0 (make-dataset `(integer string ,(make-nominal-attribute #:name 'class))
- 'class))
- (test-assert (dataset? d0))
- (test-eq "width with no entries" 2 (dataset-width d0))
- (test-eq "length with no entries" 0 (dataset-length d0))
- (test-eq "label-idx set" 'class (dataset-label-idx d0))
- (define d1 (make-dataset '(nominal) 'class))
- (test-assert (dataset-empty? d1))
- (test-eq "width of empty dataset" 0 (dataset-width d1))
- (define d2 (make-dataset `(,(make-integer-attribute #:name "i")
- ,(make-string-attribute #:name "s")
- ,(make-real-attribute #:name "r")
- ,(make-nominal-attribute #:name "n"))
- "n"))
- (define d2/s0 (make-dataset/shared d2))
- (test-assert "shared has all attributes"
- ((list-permutation? '("i" "s" "r" "n"))
- (dataset-attribute-indices d2/s0)))
- (test-eq "width of shared same" 3 (dataset-width d2/s0))
- (define d2/s1 (make-dataset/shared d2 #:columns '("i" "r" "n")))
- (test-assert "shared has subset of attributes"
- ((list-permutation? '("i" "r" "n"))
- (dataset-attribute-indices d2/s1)))
- (test-eq "width of narrower shared" 2 (dataset-width d2/s1))
- ;;; Add some entries and make sure it doesn't affect the original
- (dataset-set! d2/s1 2 "r0" "i")
- (dataset-set! d2/s1 2.71 "r0" "r")
- (dataset-set! d2/s1 'foo "r0" "n")
- (test-eq "ref" 2 (dataset-ref d2/s1 "r0" "i"))
- (test-eq "ref" 2.71 (dataset-ref d2/s1 "r0" "r"))
- (test-eq "ref" 'foo (dataset-ref d2/s1 "r0" "n"))
- (test-eq "length of single record dataset" 1 (dataset-length d2/s1))
- (test-eq "length of original dataset" 0 (dataset-length d2))
- (test-eq "length of record indices" 1 (length (dataset-record-indices d2/s1)))
- (test-assert "row indices"
- ((list-permutation? '("r0")) (dataset-record-indices d2/s1)))
- (test-assert "label values"
- ((list-permutation? '(foo)) (dataset-label-attribute-values d2/s1)))
- (test-assert "record values"
- ((list-permutation? '(("i" . 2) ("r" . 2.71) ("n" . foo)))
- (dataset-entry-value-alist d2/s1 "r0")))
- ;;; Create a new attribute and derive some values for it
- (dataset-derive-attribute!
- d2/s1
- (make-numeric-attribute #:name "s*")
- '(/ "r" "i"))
- (test-eq "length with derived" 1 (dataset-length d2/s1))
- (test-eq "width with derived" 3 (dataset-width d2/s1))
- (test-eq "width of original after shared derive"
- 3 (dataset-width d2))
- (define derived-value (assoc-ref (dataset-entry-value-alist d2/s1 "r0") "s*"))
- (test-eqv "derived value" 1.355 derived-value)
- (test-assert "original does not know about derived attribute"
- ((list-permutation? '("i" "r" "s" "n"))
- (dataset-attribute-indices d2)))
- ;;; Check set-dataset-entry-values!
- (set-dataset-entry-values! d2 "r1"
- '(("i" . 3) ("r" . 3.14) ("s" . "bust") ("n" . bar)))
- (test-eq "shared datasets unchanged"
- 0 (dataset-length d2/s0))
- (test-eq "shared datasets unchanged"
- 1 (dataset-length d2/s1))
- (test-eq "set values for new record changes length"
- 1 (dataset-length d2))
- (test-assert "new record indices"
- ((list-permutation? '("r1"))
- (dataset-record-indices d2)))
- ;;; Update the entry for the "i" attribute
- (set-dataset-entry-values! d2 "r1" '(("i" . 4)))
- (test-eq "updating values does not change dataset length"
- 1 (dataset-length d2))
- (test-assert "updated record values"
- ((list-permutation?
- '(("i" . 4) ("r" . 3.14) ("s" . "bust") ("n" . bar)))
- (dataset-entry-value-alist d2 "r1")))
- ;;; Values for "unknown" attribute indices should be ignored
- (set-dataset-entry-values! d2 "r1" '(("i" . 5) ("blaz" . flib)))
- (test-eq "no change for unknown attribute index"
- 1 (dataset-length d2))
- (test-eq "no change for unknown attribute index"
- 3 (dataset-width d2))
- (test-assert "updated record values"
- ((list-permutation?
- '(("i" . 5) ("r" . 3.14) ("s" . "bust") ("n" . bar)))
- (dataset-entry-value-alist d2 "r1")))
- ;;; Check set-delimited!
- (let* ((data "
- a,classy,1,4,blurp,qu,0.2
- b,classx,2,6,blub,qa,0.3")
- (foo (make-ordinal-attribute
- #:name 'foo
- #:read-value identity
- #:dissector-gen (cut ordinal-dissector
- <> <>
- string<? string=?)))
- (bar (make-string-attribute #:name 'bar))
- (baz (make-integer-attribute #:name 'baz))
- (bin (make-integer-attribute #:name 'bin))
- (bit (make-nominal-attribute
- #:name 'bit
- #:read-value string->symbol))
- (bug (make-string-attribute #:name 'bug))
- (baf (make-numeric-attribute #:name 'baf))
- (attribute-map `((foo . 0) (bar . 1) (baz . 2)
- (bin . 3) (bit . 4) (bug . 5) (baf . 6)))
- (datasets
- (vector (make-dataset (list foo bar baz bin bit bug baf) 'bar)
- (make-dataset (list foo bar bin) 'bar)
- (make-dataset (list foo bar bit baf) 'bit)
- (make-dataset (list bar baz bin bug) 'bug))))
- (vector-for-each
- (lambda (i d args)
- (apply set-delimited! d attribute-map
- (open-input-string data) args))
- datasets
- '#(() () () (#:rec-idx foo)))
- ;; Check that the rec-idx-col option worked correctly
- (test-assert "rec-idx-col record indices"
- ((list-permutation? '(a b))
- (dataset-record-indices (vector-ref datasets 3))))
- (vector-for-each
- (lambda (i d)
- (test-eq (format #f "length of dataset ~a" i)
- 2 (dataset-length d)))
- datasets)
- (vector-for-each
- (lambda (i d w)
- (test-eq (format #f "delimited width for dataset ~a" i)
- w (dataset-width d)))
- datasets
- #(6 2 3 3))
- (vector-for-each
- (lambda (i d v)
- (for-each
- (lambda (c)
- (test-assert (format #f "column values of ~a for dataset ~a"
- (car c) i)
- ((list-permutation? (cdr c))
- (dataset-attribute-values d (car c)))))
- v))
- datasets
- '#(((foo . ("a" "b")) (bin . (4 6)) (bit . (blurp blub)))
- ((bar . ("classy" "classx")) (bin . (4 6)))
- ((baf . (0.2 0.3))))))
- ;;; Check delimited->dataset
- (let* ((data "
- foo,bar,baz,bin,bit,bug,baf
- a,classy,1,4,blurp,qu,0.2
- b,classx,2,6,blub,qa,0.3")
- ;; Attribute names should be derived from the header
- (attributes `(,(make-ordinal-attribute
- #:read-value identity
- #:dissector-gen (cut ordinal-dissector
- <> <>
- string<? string=?))
- ,(make-string-attribute)
- ,(make-integer-attribute)
- ,(make-integer-attribute)
- ,(make-nominal-attribute
- #:read-value string->symbol)
- ,(make-string-attribute)
- ,(make-numeric-attribute)))
- (datasets `#(,(delimited->dataset attributes
- 'foo ;label-idx
- (open-input-string data))
- ,(delimited->dataset attributes
- 'bar ;label-idx
- (open-input-string data))
- ,(delimited->dataset attributes
- 'baf
- (open-input-string data)
- #:rec-idx 'foo)
- ,(delimited->dataset (list-mask attributes '(0 1 3 5))
- 'bar
- (open-input-string data)
- #:rec-idx 'foo)
- ,(delimited->dataset (list-mask attributes '(0 1 3 5 6))
- 'bar
- (open-input-string data)
- #:rec-idx 'foo))))
- (vector-for-each
- (lambda (i d w)
- (test-eq w (dataset-width d)))
- datasets
- #(6 6 5 2 3))
- (vector-for-each
- (lambda (i d)
- (test-eq 2 (dataset-length d)))
- datasets)
- (vector-for-each
- (lambda (i d c)
- (test-assert (format #f "dataset ~a column names" i)
- ((list-permutation? c)
- (dataset-attribute-indices d)))
- (test-assert (format #f "dataset ~a column attributes" i)
- ((list-permutation? c)
- (map attribute-name
- (dataset-attributes d)))))
- datasets
- '#((foo bar baz bin bit bug baf)
- (foo bar baz bin bit bug baf)
- (foo bar baz bin bit bug baf)
- (bug bin bar)
- (baf bug bin bar)))
- (vector-for-each
- (lambda (i d v)
- (for-each
- (lambda (c)
- (test-assert (format #f "column values of ~a for dataset ~a"
- (car c) i)
- ((list-permutation? (cdr c))
- (dataset-attribute-values d (car c)))))
- v))
- datasets
- '#(((foo . ("a" "b")) (bin . (4 6)) (bit . (blurp blub)))
- ((bar . ("classy" "classx")) (bin . (4 6)))
- ((baf . (0.2 0.3)))
- ((bug . ("qu" "qa")) (bin . (4 6)) (bar . ("classy" "classx")))
- ((baf . (0.2 0.3)) (bug . ("qu" "qa")))))
- (vector-for-each
- (lambda (i d l)
- (test-assert (format #f "label values for dataset ~a" i)
- ((list-permutation? l)
- (dataset-label-attribute-values d))))
- datasets
- '#(("a" "b")
- ("classy" "classx")
- (0.2 0.3)
- ("classy" "classx")
- ("classy" "classx")))
- ;; Check dataset-filter
- (vector-for-each
- (lambda (i d filters+assertions)
- (for-each
- (lambda (f) ;f is a pair ((row-pred . col-pred) (assertions...))
- (let* ((row-pred (caar f))
- (col-pred (cdar f))
- (assertions (cdr f))
- (ds/f (dataset-filter d
- #:record-pred row-pred
- #:attribute-pred col-pred)))
- (for-each
- (lambda (assertion)
- (test-assert (assertion ds/f)))
- assertions)))
- filters+assertions))
- datasets
- `#((((#t #|row-pred|# . #t #|col-pred|#)
- ,(lambda (d) ((list-permutation? '(foo bar baz bin bit bug baf))
- (dataset-attribute-indices d)))
- ,(lambda (d) ((list-permutation? '("a" "b"))
- (dataset-label-attribute-values d))))
- ((,(lambda (ri vals) (string=? (assoc-ref vals 'bar) "classy")) . #t)
- ,(lambda (d) ((list-permutation? '("a"))
- (dataset-label-attribute-values d)))
- ,(lambda (d) (= 1 (dataset-length d))))
- ((#t . ,(lambda (ai vals) (memq ai '(foo bar bin bit))))
- ,(lambda (d) ((list-permutation? '(foo bar bin bit))
- (dataset-attribute-indices d))))
- ((,(lambda (ri vals) (< (assoc-ref vals 'baf) 0.25)) . #t)
- ,(lambda (d) (= 1 (dataset-length d))))))))
- ;;; Check dataset-partition-records
- (let* ((data "
- rec,data,class
- 0,0.1,a
- 1,0.12,a
- 2,0.09,a
- 3,0.21,b
- 4,0.18,b
- 5,0.11,a
- 6,0.121,a
- 7,0.23,b
- 8,0.04,c")
- (attributes `(,(make-nominal-attribute)
- ,(make-numeric-attribute)))
- (dataset (delimited->dataset attributes
- 'class
- (open-input-string data)
- #:rec-idx 'rec))
- (parts `(,(dataset-partition-records
- `(,(lambda (ri vals)
- (< (assoc-ref vals 'data) 0.06))
- ,(lambda (ri vals)
- (< (assoc-ref vals 'data) 0.15)))
- dataset)
- ,(dataset-partition-records
- `(,(lambda (ri vals)
- (< (assoc-ref vals 'data) 0.12)))
- dataset))))
- (for-each
- (lambda (part l)
- (test-eq "partition parts" l (length part)))
- parts
- '(3 2))
- (for-each
- (lambda (part data-sizes)
- (for-each
- (lambda (ds size)
- (test-eq "partition size"
- size (dataset-length ds)))
- part data-sizes))
- parts
- '((1 5 3) (4 5))))
- (test-end "dataset-test")
- ;; (define (test-dataset->arff)
- ;; (begin
- ;; (let ((d (make <dataset>))
- ;; (d1 (make <dataset> #:attributes (list 'real 'integer 'ordinal 'nominal)))
- ;; (counter 0))
- ;; (begin
- ;; (for-each
- ;; (lambda (i)
- ;; (for-each
- ;; (lambda (j)
- ;; (begin
- ;; (dataset-set! d1 counter i j)
- ;; (set! counter (1+ counter))))
- ;; (col-tags (entries d1))))
- ;; (list 'e0 'e1 'e2 'e3 'e4)))
- ;; (dataset->arff d1))
- ;; (newline)))
- ;; Try reading in a dataset from arff. Output it to verify the contents.
- ;; (define (test-arff->dataset)
- ;; (begin
- ;; (let ((d (arff->dataset (open-input-string "
- ;; @relation bar
- ;; % This is a test dataset
- ;; @attribute tag string
- ;; @attribute funk string
- ;; @attribute foo numeric
- ;; @attribute bin numeric
- ;; @attribute bork [quick,quack]
- ;; @attribute frob {blurb,blip,blup}
- ;; @attribute label {a,b,c}
- ;; @data
- ;; e4,\"friz\",1.0,2.0,quick,blip,a
- ;; e3,\"fruz\",0.7,2.5,quick,blip,a
- ;; e2,\"frum\",1.2,2.0,quack,blurp,c
- ;; e1,\"fraz\",1.1,2.3,quack,blup,c
- ;; e0,\"frim\",1.6,2.9,quack,blup,b"))))
- ;; (begin
- ;; (dataset->arff d)
- ;; (newline)))
- ;; (let ((d (arff->dataset (open-input-string "
- ;; @relation bar
- ;; @attribute name string
- ;; @attribute class {a,b,c}
- ;; @attribute funk string
- ;; @attribute ignored [blarney,quack,silly]
- ;; @attribute val numeric
- ;; @data
- ;; e4,a,foo,friz,1.0
- ;; e3,b,bar,fratz,3.2
- ;; e2,b,biz,frumble,0.75
- ;; e1,c,bur,fram,0.01
- ;; e0,a,fit,frobble,10")
- ;; #:ignore-attributes (list 'ignored)
- ;; #:tag-index 'name
- ;; #:label-index 'class)))
- ;; (begin
- ;; (dataset->arff d)
- ;; (newline)))))
- ;; (define (test-set-delimited!)
- ;; (begin
- ;; (let ((d (make <dataset> #:attributes (list 'real 'integer))))
- ;; (begin
- ;; (set-delimited! d (iota 4) (open-input-string "
- ;; e1,1.0,2,a
- ;; e3,2.0,3,b
- ;; e8,3.0,4,a
- ;; e4,4.0,1,r"))
- ;; (dataset-set-domains! d)
- ;; (dataset->arff d)
- ;; (newline)))
- ;; ;; Test another invocation, with a non-trivial column-mask, alternate
- ;; ;; delimiter, and out-of-order tag and label indices.
- ;; (let ((d (make <dataset> #:attributes (list 'real 'integer))))
- ;; (begin
- ;; (set-delimited! d (list 0 1 2 4) (open-input-string "
- ;; a:1.0:2:fuzz:e1
- ;; b:2.0:3:fizz:e3
- ;; a:3.0:4:fizz:e8
- ;; r:4.0:1:fuzz:e4")
- ;; #:tag-index 4 #:label-index 0
- ;; #:delimiter #\:)
- ;; (dataset-set-domains! d)
- ;; (dataset->arff d)
- ;; (newline)))
- ;; ;; This invocation will create record tags
- ;; (let ((d (make <dataset> #:attributes (list 'nominal 'ordinal 'integer))))
- ;; (begin
- ;; (set-delimited! d (list 5 1 2 4) (open-input-string "
- ;; a,b,1,foo,10,classy
- ;; a,d,3,bar,20,classx
- ;; t,a,2,biz,5,classe")
- ;; #:tag-index #f #:label-index 5)
- ;; (dataset-set-domains! d)
- ;; (dataset->arff d)
- ;; (newline)))))
- ;; (define (test-dataset-filter)
- ;; (let ((d (make-test-dataset)))
- ;; (dataset->arff (dataset-filter
- ;; d
- ;; #:record-pred (lambda (rt elts)
- ;; (eq? (assoc-ref elts 'frob) 'blup))
- ;; #:attribute-pred #t))))
- ;; (define (make-test-dataset)
- ;; (arff->dataset (open-input-string "
- ;; @relation bar
- ;; % This is a test dataset
- ;; @attribute tag string
- ;; @attribute funk string
- ;; @attribute foo numeric
- ;; @attribute bin numeric
- ;; @attribute bork [quick,quack]
- ;; @attribute frob {blurp,blip,blup}
- ;; @attribute label {a,b,c}
- ;; @data
- ;; e4,\"friz\",1.0,2.0,quick,blip,a
- ;; e3,\"fruz\",0.7,2.5,quick,blip,a
- ;; e2,\"frum\",1.2,2.0,quack,blurp,c
- ;; e1,\"fraz\",1.1,2.3,quack,blup,c
- ;; e0,\"frim\",1.6,2.9,quack,blup,b")))
- ;; Local Variables:
- ;; fill-column: 72
- ;; End:
|