123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702 |
- (define-module (data-mining classification decision-trees)
- #:use-module (data-mining dataset)
- #:use-module (data-mining attributes)
- #:use-module (srfi srfi-1)
- #:use-module (srfi srfi-9) ;define-record-type
- #:use-module (data-mining util) ;(list-split substitute-bindings successive-apply)
- #:use-module (data-mining type-conversions) ;(as-string)
- #:use-module (srfi srfi-26)
- #:use-module (ice-9 format)
- #:use-module (ice-9 match)
- #:use-module (ice-9 receive)
- #:use-module (ice-9 threads) ;(par-map current-processor-count)
- #:use-module (ice-9 regex) ;(regexp-substitute/global)
- #:export (make-tree
- tree->c
- tree-data
- tree-value
- tree-children
- tree-depth
- tree-width
- leaf?
- internal?
- tree-apply
- classify
- leaf-values
- true?
- false?
- condition->c
- predicate->c
- predicate->record-filter
- dataset-label
- label-counts
- impurity
- gain
- stop-induction?
- cost-by-data
- induce-decision-tree-classifier
- default-measure
- default-min-dataset-size
- prefixing-port))
- (define author "Eric Bavier <bavier@member.fsf.org")
- (define date "Thu Apr 4 16:56:57 CDT 2013 ")
- (define copyright "GPLv3+")
- ;; =================================================
- ;; Condition class
- ;; =================================================
- ;;
- ;; Each branch of a decision tree will have an
- ;; associated condition which if true directs flow
- ;; down that branch. To determine the truth of a
- ;; <cond> the procedure PREDICATE is applied to
- ;; ARGS.
- ;;
- ;; Note: The reason we define this class instead of
- ;; just having each branch hold onto a procedure is
- ;; for output purposes. It's easier to determine
- ;; how to map a condition to a C statement if we
- ;; have direct access to the predicate.
- ;; (define-class <cond> ()
- ;; (predicate #:init-value (const #t)
- ;; #:init-keyword #:predicate)
- ;; (args #:init-value '()
- ;; #:init-keyword #:args))
- ;; (define-class <else> (<cond>))
- ;; Evaluate the <cond> C for some set of value
- ;; bindings established via the alist ENVIRON. If
- ;; any of the condition arguments is a symbol which
- ;; is in the set of keys for ENVIRON, then the
- ;; corresponding alist value is substituted before
- ;; evaluation. Symbols which do not contain an
- ;; associated value are left as-is for the
- ;; evaluation.
- ;; (define-method (true? (c <cond>) environ)
- ;; (apply (slot-ref c 'predicate)
- ;; (substitute-bindings (slot-ref c 'args)
- ;; environ)))
- ;; (define-method (true? (e <else>) environ) #t)
- ;; (define-method (false? (c <cond>) environ)
- ;; (not (true? c environ)))
- ;;; XXX: These simplified condition/predicate tests
- ;;; should suffice for us. No need to define extra
- ;;; classes.
- (define (true? pred environ)
- (define (quote-symbols lst)
- (recursive-map (lambda (x)
- (if (symbol? x) `(quote ,x) x))
- lst))
- (eval (quote-symbols
- (substitute-bindings pred environ))
- (interaction-environment)))
- (define false? (negate true?))
- ;;; C should be a symbol representing a scheme
- ;;; condition (e.g. <, >, string=?, etc...)
- (define (condition->c c port)
- (if (symbol? c)
- (condition->c
- (eval c (interaction-environment))
- port)
- (cond
- ((eq? c < ) (format port "<" ))
- ((eq? c <=) (format port "<="))
- ((eq? c = ) (format port "=="))
- ((eq? c eq?) (format port "=="))
- ((eq? c >=) (format port ">="))
- ((eq? c > ) (format port ">" )))))
- ;; Note: this works currently for standard numerical
- ;; comparison's, but could be extended for string
- ;; comparison by looking for `string=?' predicate
- ;; (and have the induction procedure generate such
- ;; predicates)
- (define (predicate->c pred port)
- (define (ensure-string x)
- (if (symbol? x) (symbol->string x) x))
- (match pred
- ((condition left right)
- (format port
- "if (~a ~a ~a)"
- (ensure-string left)
- (condition->c condition #f)
- (ensure-string right)))
- ((true) ;XXX: are there other cases where there
- ;might be a single symbol?
- (format port "else"))))
- (define (predicate->record-filter g)
- (lambda (i vals) (true? g vals)))
- ;; =================================================
- ;; Data Utilities
- ;; =================================================
- ;; Determine, on the whole, what label the entire
- ;; dataset should belong to. Currently, the only
- ;; implemented method for doing this is to do a
- ;; majority vote. If the dataset is empty, this
- ;; procedure returns #f.
- (define (dataset-label ds)
- (let ((counts (label-counts ds)))
- (if (null? counts)
- #f
- (car (apply max* cdr counts)))))
- (define (label-counts ds)
- (let ((score-board (make-hash-table)))
- (for-each
- (lambda (l)
- (hash-set! score-board l
- (1+ (hash-ref score-board l 0))))
- (dataset-label-attribute-values ds))
- (hash-map->list cons score-board)))
- (define default-measure (make-parameter 'entropy))
- ;; Calculates the impurity of a dataset, using some
- ;; measure [entropy,gini]
- (define* (impurity ds #:optional (measure (default-measure)))
- (let ((l (dataset-length ds)))
- (if (> l 0)
- (let ((P (map (compose (cut / <> l) cdr) (label-counts ds))))
- (cond ((eq? measure 'entropy)
- (fold (lambda (p acc)
- (if (= p 0) acc
- (- acc (* p (log p)))))
- 0 P))
- ((eq? measure 'gini)
- (- 1 (fold (lambda (p acc) (+ acc (* p p))) 0 P)))))
- 0)))
- ;; =================================================
- ;; Decision-Tree Node
- ;; =================================================
- ;;
- ;; The value of internal nodes could be used for
- ;; example, in an autotuning context, to initialize
- ;; and/or cache split attribute values. To be
- ;; useful in such an application, internal values
- ;; should be written before the branches, but in
- ;; general the value could also be written following
- ;; the output of the branches. For this purpose, a
- ;; user could pass a value to keyword arguments
- ;; #:pre-hook and #:post-hook, which could be either
- ;; strings, ports, or procedures. If a procedure,
- ;; then of one argument which is the current node
- ;; being output.
- (define-record-type tree
- (make-tree*
- value ;Some scheme value of procedure.
- branches ;list of (pred . child) pairs
- data ;The dataset. For leaf nodes, this is the
- ;set of entries that made their way through
- ;the decision tree before induction stopped.
- ;For internal tree nodes, this is the data
- ;before being split to it's branches.
- )
- tree?
- (value tree-value* set-tree-value!)
- (branches tree-branches)
- (data tree-data))
- (define (tree-children tree)
- (map cdr (tree-branches tree)))
- (define* (make-tree #:key
- (value #f)
- (branches '())
- (data #f))
- (make-tree* value branches data))
- ;;; Return the value associated with the decision
- ;;; tree node DT. Uses a cached result if
- ;;; available, otherwise, calculates the node value
- ;;; based on the node's associated data.
- (define* (tree-value tree #:optional (dflt #f))
- (or (tree-value* tree)
- ;; else set and return the value based on the
- ;; dataset label
- (let ((label (or (dataset-label (tree-data tree))
- (if (procedure? dflt)
- (dflt tree)
- dflt))))
- (set-tree-value! tree label)
- label)))
- ;; Compute the information gain at this node
- ;;
- ;; Gain = I_p - sum( w_c * I_c )
- ;;
- ;; where w is the weight of the child, which is the
- ;; ratio of records at child c to the number of
- ;; records at the parent node, I_p is the impurity
- ;; if the given node, and I_c is the impurity of
- ;; child node c.
- (define* (data-gain parent children)
- (let ((parent-size (dataset-length parent))
- (child-sizes (map dataset-length children))
- (parent-impurity (impurity parent))
- (child-impurities (map impurity children)))
- (- parent-impurity
- (fold (lambda (s w acc)
- (+ acc (* (/ w parent-size) s)))
- 0
- child-impurities
- child-sizes))))
- ;;; Returns the information gain from going from
- ;;; this node to its children. If DT is a leaf
- ;;; node, then this procedure simply returns the
- ;;; impurity if DT.
- (define* (node-gain tree)
- (data-gain (tree-data tree)
- (map tree-data
- (tree-children tree))))
- ;;; Return the maximum depth of the decision tree
- ;;; TREE.
- (define (tree-depth tree)
- (1+ (fold max 0 (map tree-depth
- (tree-children tree)))))
- ;; The number of leaf nodes in a decision-tree
- (define (tree-width tree)
- (if (leaf? tree)
- 1
- (fold + 0 (map tree-width
- (tree-children tree)))))
- ;; Is this tree node a leaf node?
- (define (leaf? tree)
- (null? (tree-branches tree)))
- ;; Is this tree node an internal node?
- (define internal? (negate leaf?))
- ;; TODO: define alist-style methods for decision
- ;; trees which return the value had from following
- ;; the decisions and branches through the tree to a
- ;; branch
- (define (tree-apply tree environ)
- (if (leaf? tree)
- ;; If we're at a leaf, the application simply
- ;; returns the node's value
- (tree-value tree)
- ;; Otherwise, we send the request to a child tree.
- (tree-apply (direct tree environ) environ)))
- (define classify tree-apply)
- ;; Return a list of the scheme values at leaf nodes.
- (define-public (leaf-values tree)
- (if (leaf? tree)
- (list (tree-value tree))
- (concatenate (map leaf-values
- (tree-children tree)))))
- ;; Direct flow from this tree node down one of its
- ;; branches to another tree node by returning the
- ;; first of TREE's children for which the guard
- ;; condition is true
- (define (direct tree environ)
- (find (compose (cut true? <> environ) car)
- (tree-branches tree)))
- ;; Get the decision-tree node resulting from
- ;; following the branches. With a single argument
- ;; this procedure returns the input dt node. With
- ;; optional arguments, return the node from
- ;; following the indexed children branches. For
- ;; example::
- ;;
- ;; (branch test-tree 1 2)
- ;;
- ;; Gets the second child node of the first child
- ;; node of test-tree
- (define (branch tree . rest)
- (if (null? rest) tree
- (let ((child (cdr (list-ref (tree-branches tree)
- (car rest)))))
- (apply branch child (cdr rest)))))
- ;;; ================================================
- ;;; Tree Splits
- ;;; ================================================
- (define-record-type split
- (make-split*
- score
- predicates
- datasets)
- split?
- (score split-score)
- (predicates split-predicates)
- (datasets split-datasets))
- ;;; Create a split by applying DISSECTOR to DATASET.
- ;;; The attribute that DISSECTOR compares against is
- ;;; removed from sub-datasets if dissector has a
- ;;; length greater than 2. The thought is that if
- ;;; DISSECTOR is a multi-way split, then there is no
- ;;; sense splitting on it later.
- (define (make-split dataset dissector)
- (let* ((filter-preds (map predicate->record-filter
- dissector))
- (remove-attr? (> (length dissector) 2))
- (subdata (dataset-partition-records
- filter-preds dataset)))
- (make-split* (data-gain dataset subdata)
- dissector
- (if remove-attr?
- ;; XXX: assume attribute name
- ;; is always the second symbol
- ;; in a predicate.
- (let ((new-cols
- (delete
- (cadar dissector)
- (dataset-attribute-indices
- dataset))))
- (map (cut make-dataset/shared
- <> #:columns new-cols)
- subdata))
- subdata))))
- ;;; If no good splits are found, returns #f.
- ;;; Otherwise returns a split.
- ;;;
- ;;; The numbers of attributes in each DATA may be
- ;;; different from DS, depending on what split was
- ;;; performed (e.g. a multi-way split may mean that
- ;;; the attribute is no longer suitable for future
- ;;; splitting, so is simply removed).
- (define (best-split dataset)
- (let ((candidate-dissectors
- (concatenate (map (lambda (attr-idx)
- (attribute-dissectors
- (dataset-attribute dataset
- attr-idx)
- (dataset-attribute-values
- dataset attr-idx)))
- (dataset-attribute-indices
- dataset #:with-label #f)))))
- ;; TODO: We need to evaluate the extremum in
- ;; parallel. Bonus points if we could get
- ;; candidate-dissectors to be generated lazily.
- ;; (format #t "Have ~a candidate dissectors for dataset of length ~a\n"
- ;; (length candidate-dissectors)
- ;; (dataset-length dataset))
- (and (not (null? candidate-dissectors))
- (receive (dissector split)
- (apply par-extremum+value*
- (cut make-split dataset <>)
- (lambda (s1 s2) (> (split-score s1)
- (split-score s2)))
- candidate-dissectors)
- split))))
- ;;; ==================================================
- ;;; Decision Tree Induction (the hard stuff)
- ;;; ==================================================
- (define-public (pure-data? ds)
- (= (length (delete-duplicates
- (dataset-label-attribute-values ds)))
- 1))
- (define-public default-min-dataset-size (make-parameter 2))
- (define-public (small-data? ds)
- (< (dataset-length ds) (default-min-dataset-size)))
- ;;; Return #t if there are no more attributes to
- ;;; split on, or if all records have the same
- ;;; attribute values
- (define-public (no-split-attrs? ds)
- (define (zero? n) (= n 0))
- (or (dataset-empty? ds)
- ;; Stops on the first attribute that has
- ;; non-unique values. So, would most of the
- ;; time only scan one attribute.
- (every (lambda (attr-idx)
- ((compose zero?
- length
- delete-duplicates
- (cut dataset-attribute-values
- ds <>))
- attr-idx))
- (dataset-attribute-indices ds))))
- ;;; Apply some standard tests to data, as well as
- ;;; any predicates in PREDS, which should be
- ;;; procedures of arity 1: taking a dataset and
- ;;; returning non-#f if induction should stop.
- (define (stop-induction? data . preds)
- (find (cut <> data)
- (append (list pure-data?
- small-data?
- no-split-attrs?)
- preds)))
- ;; Return a procedure which, when given a dataset,
- ;; returns #t if it is determined that this data is
- ;; "cheap" to leave as-is without further splitting,
- ;; as determined by the following:
- ;;
- ;; 1) determine the current label, L, of the data, D
- ;;
- ;; 2) determine the "cost" of assigning each record,
- ;; r, in D to class L, by looking up (r, L) in
- ;; COST-DATA
- ;;
- ;; 3) If the maximum value of all such costs is
- ;; greater than COST-THRESHOLD, then return #f.
- ;;
- ;; A #t result can be interpreted as meaning that it
- ;; is not costly to assign the current label to D,
- ;; even if the data is otherwise relatively "impure".
- ;;
- ;; The 'case-weight' argument can take a few forms:
- ;;
- ;; - If a symbol, interpreted as the tag of an
- ;; attribute in COST-DATA which contains weight
- ;; numbers for each record
- ;;
- ;; - If a procedure, it should return a weight
- ;; number when called with an record index.
- ;;
- (define* (cost-by-data cost-data
- #:key
- (cost-threshold 0.05)
- (case-weight (const 1.0)))
- (let ((weight (if (procedure? case-weight)
- case-weight
- ;; else symbol
- (lambda (i)
- (dataset-ref cost-data i case-weight)))))
- (lambda (data)
- (let* ((label (dataset-label data))
- (max-cost
- (apply max
- (map (lambda (r)
- (let ((cost (dataset-ref cost-data
- r label)))
- (* (weight r) cost)))
- (dataset-record-indices data)))))
- (< max-cost cost-threshold)))))
- ;;; Induce a decision tree classifier from DATA.
- ;;;
- ;;; The stop? keyword argument should be a procedure
- ;;; of arity 1 which accepts a dataset, and returns
- ;;; #t if induction should stop for the given data
- ;;; (i.e. the data forms a leaf node).
- (define* (induce-decision-tree-classifier
- data
- #:key
- (stop? stop-induction?)
- (default-class 'null)
- (tree-value (cut tree-value <> default-class)))
- (if (stop? data)
- (make-tree #:data data)
- (let ((next-split (best-split data)))
- (if (or (eq? next-split #f)
- (= (split-score next-split) 0))
- (make-tree #:data data)
- (let* ((branches
- ;; TODO: This could most likely be
- ;; parallelized.
- (map (lambda (pred data)
- (let* ((split-tree
- (induce-decision-tree-classifier
- data
- #:stop? stop?
- #:default-class default-class
- #:tree-value tree-value)))
- (cons pred split-tree)))
- (split-predicates next-split)
- (split-datasets next-split)))
- (branch-trees (map cdr branches)))
- (if (and
- (every leaf? branch-trees)
- (= (length (delete-duplicates
- (map tree-value branch-trees)))
- 1))
- ;; No reason to split of each
- ;; branch has the same value.
- (make-tree #:data data)
- (make-tree
- #:data data
- #:branches branches)))))))
- ;;; ==================================================
- ;;; Decision Tree Output
- ;;; ==================================================
- ;;; Create a port which prefixes every line printed
- ;;; to PORT with the string PRE.
- (define (prefixing-port port pre)
- (let ((at-newline #t))
- (make-soft-port
- (vector
- (lambda (c)
- (if (eq? c #\newline)
- (begin
- (write-char c port)
- (set! at-newline #t))
- (begin
- (when at-newline
- (display pre port)
- (set! at-newline #f))
- (write-char c port))))
- (lambda (s)
- (begin
- (when at-newline
- (display pre port)
- (set! at-newline #f))
- ;; Only prefix newlines that are followed by some
- ;; other characters on that line.
- (regexp-substitute/global port "(\n)([^\n]+)" s
- 'pre 1 pre 2 'post)
- (if (string=? (string-take-right s 1) "\n")
- (set! at-newline #t))))
- (lambda () (force-output port))
- #f
- (lambda () (close-port port))
- #f)
- "w")))
- (define (comment->c val)
- (let ((sv (as-string val)))
- (unless (string-null? sv)
- (format #f "/* ~a */" sv))))
- (define* (tree->c tree
- #:optional
- (out-port (current-output-port))
- #:key
- (pre-hook (compose comment->c tree-value))
- (leaf-hook (compose as-string tree-value))
- (post-hook (const "")))
- (let ((port (cond ((port? out-port) out-port)
- ((eq? out-port #t) (current-output-port))
- ((eq? out-port #f) (open-output-string))))
- (pre (pre-hook tree))
- (post (post-hook tree)))
- (begin
- (unless (or (eq? pre #f) (string-null? pre))
- (format port "~a" pre))
- (when (leaf? tree)
- (format port "~a" (leaf-hook tree)))
- (for-each
- (lambda (elem)
- (begin
- (predicate->c (car elem) port)
- (format port " {\n")
- (tree->c (cdr elem)
- (prefixing-port port " ")
- #:pre-hook pre-hook
- #:leaf-hook leaf-hook
- #:post-hook post-hook)
- (format port "} ")))
- (tree-branches tree))
- (format port "\n")
- (unless (string-null? post)
- (format port "~a\n" post))
- (when (eq? out-port #f)
- (get-output-string port)))))
- ;;; ================================================
- ;;; Tests
- ;;; ================================================
- (use-modules (srfi srfi-64)
- (data-mining test-util))
- (test-begin "decision-tree-test")
- ;;; Check predicates
- (test-assert (true? `(,< 5 10) '()))
- (test-assert (false? `(,< 10 5) '()))
- (test-assert (true? `(,< a 10) '((a . 5))))
- (test-assert (false? `(,< a 10) '((a . 12))))
- ;;; Need some test data for the following tests
- (define test-data
- (let ((input "
- rec,foo,bar,class
- 0,0.1,t,a
- 1,0.12,t,a
- 2,0.09,n,a
- 3,0.21,n,b
- 4,0.18,t,b
- 5,0.11,n,a
- 6,0.121,n,a
- 7,0.23,t,b
- 8,0.04,t,c")
- (attributes `(,(make-nominal-attribute)
- ,(make-numeric-attribute)
- ,(make-nominal-attribute
- #:read-value string->symbol)
- ,(make-nominal-attribute))))
- (delimited->dataset attributes
- 'class
- (open-input-string input)
- #:rec-idx 'rec)))
- (test-eq 2 (dataset-width test-data))
- (test-assert "test data columns"
- ((list-permutation? '(foo bar class))
- (dataset-attribute-indices test-data)))
- ;;; Check predicate->record-filter
- (define filter (predicate->record-filter `(,< foo 0.12)))
- ;;; Use that filter and make sure we get the correct
- ;;; number of records.
- (define ds/f (dataset-filter test-data
- #:record-pred filter))
- (test-eq "filtered length" 4 (dataset-length ds/f))
- (test-assert "filtered indices"
- ((list-permutation? (map string->symbol
- '("0" "2" "5" "8")))
- (dataset-record-indices ds/f)))
- ;;; Check dataset-label
- (test-equal "dataset label" "a" (dataset-label test-data))
- ;;; Check label-counts
- (test-assert "label-counts"
- (same-map? '(("a" . 5) ("b" . 3) ("c" . 1))
- (label-counts test-data)))
- ;;; Check induce-decision-tree-classifier
- (define test-tree
- (induce-decision-tree-classifier test-data))
- (format #t "depth: ~a\n" (tree-depth test-tree))
- (format #t "width: ~a\n" (tree-width test-tree))
- (tree->c test-tree
- #:pre-hook
- (lambda (tree)
- (let ((str-port (open-output-string)))
- (dataset->delimited
- (tree-data tree) str-port)
- (get-output-string str-port))))
- (test-end "decision-tree-test")
- ;;; Local Variables:
- ;;; fill-column: 52
- ;;; End:
|