bavier
/
guile-data-mining


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
							(define-module (data-mining classification decision-trees)
  #:use-module (data-mining dataset)
  #:use-module (data-mining attributes)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-9)		;define-record-type
  #:use-module (data-mining util) ;(list-split substitute-bindings successive-apply)
  #:use-module (data-mining type-conversions) ;(as-string)
  #:use-module (srfi srfi-26)
  #:use-module (ice-9 format)
  #:use-module (ice-9 match)
  #:use-module (ice-9 receive)
  #:use-module (ice-9 threads) ;(par-map current-processor-count)
  #:use-module (ice-9 regex) ;(regexp-substitute/global)
  #:export (make-tree
	    tree->c
	    tree-data
	    tree-value
	    tree-children
	    tree-depth
	    tree-width

	    leaf?
	    internal?
	    tree-apply
	    classify
	    leaf-values

	    true?
	    false?
	    condition->c
	    predicate->c
	    predicate->record-filter

	    dataset-label
	    label-counts

	    impurity
	    gain
	    stop-induction?
	    cost-by-data
	    induce-decision-tree-classifier
	    default-measure
	    default-min-dataset-size

	    prefixing-port))

(define author    "Eric Bavier <bavier@member.fsf.org")
(define date      "Thu Apr  4 16:56:57 CDT 2013 ")
(define copyright "GPLv3+")

;; =================================================
;;                Condition class
;; =================================================
;;
;; Each branch of a decision tree will have an
;; associated condition which if true directs flow
;; down that branch.  To determine the truth of a
;; <cond> the procedure PREDICATE is applied to
;; ARGS.
;;
;; Note: The reason we define this class instead of
;; just having each branch hold onto a procedure is
;; for output purposes.  It's easier to determine
;; how to map a condition to a C statement if we
;; have direct access to the predicate.
;; (define-class <cond> ()
;;   (predicate #:init-value (const #t)
;; 	     #:init-keyword #:predicate)
;;   (args #:init-value '()
;; 	#:init-keyword #:args))
;; (define-class <else> (<cond>))

;; Evaluate the <cond> C for some set of value
;; bindings established via the alist ENVIRON.  If
;; any of the condition arguments is a symbol which
;; is in the set of keys for ENVIRON, then the
;; corresponding alist value is substituted before
;; evaluation.  Symbols which do not contain an
;; associated value are left as-is for the
;; evaluation.
;; (define-method (true? (c <cond>) environ)
;;   (apply (slot-ref c 'predicate)
;; 	 (substitute-bindings (slot-ref c 'args)
;; 			      environ)))
;; (define-method (true? (e <else>) environ) #t)

;; (define-method (false? (c <cond>) environ)
;;   (not (true? c environ)))

;;; XXX: These simplified condition/predicate tests
;;; should suffice for us.  No need to define extra
;;; classes.
(define (true? pred environ)
  (define (quote-symbols lst)
    (recursive-map (lambda (x)
		     (if (symbol? x) `(quote ,x) x))
		   lst))
  (eval (quote-symbols
	 (substitute-bindings pred environ))
	(interaction-environment)))
(define false? (negate true?))

;;; C should be a symbol representing a scheme
;;; condition (e.g. <, >, string=?, etc...)
(define (condition->c c port)
  (if (symbol? c)
      (condition->c
       (eval c (interaction-environment))
       port)
      (cond
       ((eq? c < )  (format port "<" ))
       ((eq? c <=)  (format port "<="))
       ((eq? c = )  (format port "=="))
       ((eq? c eq?) (format port "=="))
       ((eq? c >=)  (format port ">="))
       ((eq? c > )  (format port ">" )))))

;; Note: this works currently for standard numerical
;; comparison's, but could be extended for string
;; comparison by looking for `string=?' predicate
;; (and have the induction procedure generate such
;; predicates)
(define (predicate->c pred port)
  (define (ensure-string x)
    (if (symbol? x) (symbol->string x) x))
  (match pred
    ((condition left right)
     (format port
	     "if (~a ~a ~a)"
	     (ensure-string left)
	     (condition->c condition #f)
	     (ensure-string right)))
    ((true) ;XXX: are there other cases where there
	    ;might be a single symbol?
     (format port "else"))))

(define (predicate->record-filter g)
  (lambda (i vals) (true? g vals)))


;; =================================================
;;                Data Utilities
;; =================================================

;; Determine, on the whole, what label the entire
;; dataset should belong to.  Currently, the only
;; implemented method for doing this is to do a
;; majority vote.  If the dataset is empty, this
;; procedure returns #f.
(define (dataset-label ds)
  (let ((counts (label-counts ds)))
    (if (null? counts)
	#f
	(car (apply max* cdr counts)))))

(define (label-counts ds)
  (let ((score-board (make-hash-table)))
    (for-each
     (lambda (l)
       (hash-set! score-board l
		  (1+ (hash-ref score-board l 0))))
     (dataset-label-attribute-values ds))
    (hash-map->list cons score-board)))

(define default-measure (make-parameter 'entropy))

;; Calculates the impurity of a dataset, using some
;; measure [entropy,gini]
(define* (impurity ds #:optional (measure (default-measure)))
  (let ((l (dataset-length ds)))
    (if (> l 0)
	(let ((P (map (compose (cut / <> l) cdr) (label-counts ds))))
	  (cond ((eq? measure 'entropy)
		 (fold (lambda (p acc)
			 (if (= p 0) acc
			     (- acc (* p (log p)))))
		       0 P))
		((eq? measure 'gini)
		 (- 1 (fold (lambda (p acc) (+ acc (* p p))) 0 P)))))
	0)))

;; =================================================
;;               Decision-Tree Node
;; =================================================
;; 
;; The value of internal nodes could be used for
;; example, in an autotuning context, to initialize
;; and/or cache split attribute values.  To be
;; useful in such an application, internal values
;; should be written before the branches, but in
;; general the value could also be written following
;; the output of the branches.  For this purpose, a
;; user could pass a value to keyword arguments
;; #:pre-hook and #:post-hook, which could be either
;; strings, ports, or procedures.  If a procedure,
;; then of one argument which is the current node
;; being output.
(define-record-type tree
  (make-tree*
   value	    ;Some scheme value of procedure.

   branches	    ;list of (pred . child) pairs

   data	;The dataset. For leaf nodes, this is the
	;set of entries that made their way through
	;the decision tree before induction stopped.
	;For internal tree nodes, this is the data
	;before being split to it's branches.
   )
  tree?
  (value    tree-value*   set-tree-value!)
  (branches tree-branches)
  (data     tree-data))

(define (tree-children tree)
  (map cdr (tree-branches tree)))

(define* (make-tree #:key
		    (value #f)
		    (branches '())
		    (data #f))
  (make-tree* value branches data))

;;; Return the value associated with the decision
;;; tree node DT.  Uses a cached result if
;;; available, otherwise, calculates the node value
;;; based on the node's associated data.
(define* (tree-value tree #:optional (dflt #f))
  (or (tree-value* tree)
      ;; else set and return the value based on the
      ;; dataset label
      (let ((label (or (dataset-label (tree-data tree))
		       (if (procedure? dflt)
			   (dflt tree)
			   dflt))))
	(set-tree-value! tree label)
	label)))

;; Compute the information gain at this node
;;
;; Gain = I_p - sum( w_c * I_c )
;;
;; where w is the weight of the child, which is the
;; ratio of records at child c to the number of
;; records at the parent node, I_p is the impurity
;; if the given node, and I_c is the impurity of
;; child node c.
(define* (data-gain parent children)
  (let ((parent-size (dataset-length parent))
	(child-sizes (map dataset-length children))
	(parent-impurity (impurity parent))
	(child-impurities (map impurity children)))
    (- parent-impurity
       (fold (lambda (s w acc)
	       (+ acc (* (/ w parent-size) s)))
	     0
	     child-impurities
	     child-sizes))))

;;; Returns the information gain from going from
;;; this node to its children.  If DT is a leaf
;;; node, then this procedure simply returns the
;;; impurity if DT.
(define* (node-gain tree)
  (data-gain (tree-data tree)
	     (map tree-data
		  (tree-children tree))))

;;; Return the maximum depth of the decision tree
;;; TREE.
(define (tree-depth tree)
  (1+ (fold max 0 (map tree-depth
		       (tree-children tree)))))

;; The number of leaf nodes in a decision-tree
(define (tree-width tree)
  (if (leaf? tree)
      1
      (fold + 0 (map tree-width
		     (tree-children tree)))))

;; Is this tree node a leaf node?
(define (leaf? tree)
  (null? (tree-branches tree)))
;; Is this tree node an internal node?
(define internal? (negate leaf?))

;; TODO: define alist-style methods for decision
;; trees which return the value had from following
;; the decisions and branches through the tree to a
;; branch
(define (tree-apply tree environ)
  (if (leaf? tree)
      ;; If we're at a leaf, the application simply
      ;; returns the node's value
      (tree-value tree)
      ;; Otherwise, we send the request to a child tree.
      (tree-apply (direct tree environ) environ)))
(define classify tree-apply)

;; Return a list of the scheme values at leaf nodes.
(define-public (leaf-values tree)
  (if (leaf? tree)
      (list (tree-value tree))
      (concatenate (map leaf-values
			(tree-children tree)))))

;; Direct flow from this tree node down one of its
;; branches to another tree node by returning the
;; first of TREE's children for which the guard
;; condition is true
(define (direct tree environ)
  (find (compose (cut true? <> environ) car)
	(tree-branches tree)))

;; Get the decision-tree node resulting from
;; following the branches.  With a single argument
;; this procedure returns the input dt node.  With
;; optional arguments, return the node from
;; following the indexed children branches.  For
;; example::
;;
;;  (branch test-tree 1 2)
;;
;; Gets the second child node of the first child
;; node of test-tree
(define (branch tree . rest)
  (if (null? rest) tree
      (let ((child (cdr (list-ref (tree-branches tree)
				  (car rest)))))
	(apply branch child (cdr rest)))))


;;; ================================================
;;;                Tree Splits
;;; ================================================

(define-record-type split
  (make-split*
   score
   predicates
   datasets)
  split?
  (score      split-score)
  (predicates split-predicates)
  (datasets   split-datasets))

;;; Create a split by applying DISSECTOR to DATASET.
;;; The attribute that DISSECTOR compares against is
;;; removed from sub-datasets if dissector has a
;;; length greater than 2.  The thought is that if
;;; DISSECTOR is a multi-way split, then there is no
;;; sense splitting on it later.
(define (make-split dataset dissector)
  (let* ((filter-preds (map predicate->record-filter
			    dissector))
	 (remove-attr? (> (length dissector) 2))
	 (subdata (dataset-partition-records
		   filter-preds dataset)))
    (make-split* (data-gain dataset subdata)
		 dissector
		 (if remove-attr?
		     ;; XXX: assume attribute name
		     ;; is always the second symbol
		     ;; in a predicate.
		     (let ((new-cols
			    (delete
			     (cadar dissector)
			     (dataset-attribute-indices
			      dataset))))
		       (map (cut make-dataset/shared
				 <> #:columns new-cols)
			    subdata))
		     subdata))))

;;; If no good splits are found, returns #f.
;;; Otherwise returns a split.
;;;
;;; The numbers of attributes in each DATA may be
;;; different from DS, depending on what split was
;;; performed (e.g. a multi-way split may mean that
;;; the attribute is no longer suitable for future
;;; splitting, so is simply removed).
(define (best-split dataset)
  (let ((candidate-dissectors
	 (concatenate (map (lambda (attr-idx)
			     (attribute-dissectors
			      (dataset-attribute dataset
						 attr-idx)
			      (dataset-attribute-values
			       dataset attr-idx)))
			   (dataset-attribute-indices
			    dataset #:with-label #f)))))
    ;; TODO: We need to evaluate the extremum in
    ;; parallel.  Bonus points if we could get
    ;; candidate-dissectors to be generated lazily.
    ;; (format #t "Have ~a candidate dissectors for dataset of length ~a\n"
    ;; 	    (length candidate-dissectors)
    ;; 	    (dataset-length dataset))
    (and (not (null? candidate-dissectors))
	 (receive (dissector split)
		  (apply par-extremum+value*
			 (cut make-split dataset <>)
			 (lambda (s1 s2) (> (split-score s1)
					    (split-score s2)))
			 candidate-dissectors)
		  split))))

;;; ==================================================
;;;      Decision Tree Induction (the hard stuff)
;;; ==================================================

(define-public (pure-data? ds)
  (= (length (delete-duplicates
	      (dataset-label-attribute-values ds)))
     1))

(define-public default-min-dataset-size (make-parameter 2))
(define-public (small-data? ds)
  (< (dataset-length ds) (default-min-dataset-size)))

;;; Return #t if there are no more attributes to
;;; split on, or if all records have the same
;;; attribute values
(define-public (no-split-attrs? ds)
  (define (zero? n) (= n 0))
  (or (dataset-empty? ds)
      ;; Stops on the first attribute that has
      ;; non-unique values.  So, would most of the
      ;; time only scan one attribute.
      (every (lambda (attr-idx)
	       ((compose zero?
			 length
			 delete-duplicates
			 (cut dataset-attribute-values
			      ds <>))
		attr-idx))
	     (dataset-attribute-indices ds))))

;;; Apply some standard tests to data, as well as
;;; any predicates in PREDS, which should be
;;; procedures of arity 1: taking a dataset and
;;; returning non-#f if induction should stop.
(define (stop-induction? data . preds)
  (find (cut <> data)
	(append (list pure-data?
		      small-data?
		      no-split-attrs?)
		preds)))

;; Return a procedure which, when given a dataset,
;; returns #t if it is determined that this data is
;; "cheap" to leave as-is without further splitting,
;; as determined by the following:
;;
;; 1) determine the current label, L, of the data, D
;; 
;; 2) determine the "cost" of assigning each record,
;;    r, in D to class L, by looking up (r, L) in
;;    COST-DATA
;;    
;; 3) If the maximum value of all such costs is
;;    greater than COST-THRESHOLD, then return #f.
;;
;; A #t result can be interpreted as meaning that it
;; is not costly to assign the current label to D,
;; even if the data is otherwise relatively "impure".
;;
;; The 'case-weight' argument can take a few forms:
;;
;; - If a symbol, interpreted as the tag of an
;;   attribute in COST-DATA which contains weight
;;   numbers for each record
;;
;; - If a procedure, it should return a weight
;;   number when called with an record index.
;;   
(define* (cost-by-data cost-data
		       #:key
		       (cost-threshold 0.05)
		       (case-weight (const 1.0)))
  (let ((weight (if (procedure? case-weight)
		    case-weight
		    ;; else symbol
		    (lambda (i)
		      (dataset-ref cost-data i case-weight)))))
    (lambda (data)
      (let* ((label (dataset-label data))
	     (max-cost
	      (apply max
		     (map (lambda (r)
			    (let ((cost (dataset-ref cost-data
						     r label)))
			      (* (weight r) cost)))
			  (dataset-record-indices data)))))
	(< max-cost cost-threshold)))))

;;; Induce a decision tree classifier from DATA.
;;;
;;; The stop? keyword argument should be a procedure
;;; of arity 1 which accepts a dataset, and returns
;;; #t if induction should stop for the given data
;;; (i.e. the data forms a leaf node).
(define* (induce-decision-tree-classifier
	  data
	  #:key
	  (stop? stop-induction?)
	  (default-class 'null)
	  (tree-value (cut tree-value <> default-class)))
  (if (stop? data)
      (make-tree #:data data)
      (let ((next-split (best-split data)))
	(if (or (eq? next-split #f)
		(= (split-score next-split) 0))
	    (make-tree #:data data)
	    (let* ((branches
		    ;; TODO: This could most likely be
		    ;; parallelized.
		    (map (lambda (pred data)
			   (let* ((split-tree
				   (induce-decision-tree-classifier
				    data
				    #:stop? stop?
				    #:default-class default-class
				    #:tree-value tree-value)))
			     (cons pred split-tree)))
			 (split-predicates next-split)
			 (split-datasets   next-split)))
		   (branch-trees (map cdr branches)))
	      (if (and
		   (every leaf? branch-trees)
		   (= (length (delete-duplicates
			       (map  tree-value branch-trees)))
		      1))
		  ;; No reason to split of each
		  ;; branch has the same value.
		  (make-tree #:data data)
		  (make-tree
		   #:data data
		   #:branches branches)))))))

;;; ==================================================
;;;               Decision Tree Output
;;; ==================================================

;;; Create a port which prefixes every line printed
;;; to PORT with the string PRE.
(define (prefixing-port port pre)
  (let ((at-newline #t))
    (make-soft-port
     (vector
      (lambda (c)
	(if (eq? c #\newline)
	    (begin
	      (write-char c port)
	      (set! at-newline #t))
	    (begin
	      (when at-newline
		    (display pre port)
		    (set! at-newline #f))
	      (write-char c port))))
      (lambda (s)
	(begin
	  (when at-newline
		(display pre port)
		(set! at-newline #f))
	  ;; Only prefix newlines that are followed by some
	  ;; other characters on that line.
	  (regexp-substitute/global port "(\n)([^\n]+)" s
				    'pre 1 pre 2 'post)
	  (if (string=? (string-take-right s 1) "\n")
	      (set! at-newline #t))))
      (lambda () (force-output port))
      #f
      (lambda () (close-port port))
      #f)
     "w")))

(define (comment->c val)
  (let ((sv (as-string val)))
   (unless (string-null? sv)
	   (format #f "/* ~a */" sv))))

(define* (tree->c tree
		  #:optional
		  (out-port (current-output-port))
		  #:key
		  (pre-hook (compose comment->c tree-value))
		  (leaf-hook (compose as-string tree-value))
		  (post-hook (const "")))
  (let ((port (cond ((port? out-port) out-port)
		    ((eq? out-port #t) (current-output-port))
		    ((eq? out-port #f) (open-output-string))))
	(pre (pre-hook tree))
	(post (post-hook tree)))
    (begin
      (unless (or (eq? pre #f) (string-null? pre))
	      (format port "~a" pre))
      (when (leaf? tree)
	    (format port "~a" (leaf-hook tree)))
      (for-each
       (lambda (elem)
	 (begin
	   (predicate->c (car elem) port)
	   (format port " {\n")
	   (tree->c (cdr elem)
		    (prefixing-port port "  ")
		    #:pre-hook pre-hook
		    #:leaf-hook leaf-hook
		    #:post-hook post-hook)
	   (format port "} ")))
       (tree-branches tree))
      (format port "\n")
      (unless (string-null? post)
	      (format port "~a\n" post))
      (when (eq? out-port #f)
	    (get-output-string port)))))

;;; ================================================
;;;                   Tests
;;; ================================================

(use-modules (srfi srfi-64)
	     (data-mining test-util))

(test-begin "decision-tree-test")

;;; Check predicates
(test-assert (true?  `(,< 5 10) '()))
(test-assert (false? `(,< 10 5) '()))
(test-assert (true?  `(,< a 10) '((a . 5))))
(test-assert (false? `(,< a 10) '((a . 12))))

;;; Need some test data for the following tests
(define test-data
  (let ((input "
rec,foo,bar,class
0,0.1,t,a
1,0.12,t,a
2,0.09,n,a
3,0.21,n,b
4,0.18,t,b
5,0.11,n,a
6,0.121,n,a
7,0.23,t,b
8,0.04,t,c")
	(attributes `(,(make-nominal-attribute)
		      ,(make-numeric-attribute)
		      ,(make-nominal-attribute
			#:read-value string->symbol)
		      ,(make-nominal-attribute))))
    (delimited->dataset attributes
			'class
			(open-input-string input)
			#:rec-idx 'rec)))
(test-eq 2 (dataset-width test-data))
(test-assert "test data columns"
	     ((list-permutation? '(foo bar class))
	      (dataset-attribute-indices test-data)))

;;; Check predicate->record-filter
(define filter (predicate->record-filter `(,< foo 0.12)))
;;; Use that filter and make sure we get the correct
;;; number of records.
(define ds/f (dataset-filter test-data
			     #:record-pred filter))
(test-eq "filtered length" 4 (dataset-length ds/f))
(test-assert "filtered indices"
	     ((list-permutation? (map string->symbol
				      '("0" "2" "5" "8")))
	      (dataset-record-indices ds/f)))

;;; Check dataset-label
(test-equal "dataset label" "a" (dataset-label test-data))

;;; Check label-counts
(test-assert "label-counts"
	     (same-map? '(("a" . 5) ("b" . 3) ("c" . 1))
			(label-counts test-data)))

;;; Check induce-decision-tree-classifier

(define test-tree
  (induce-decision-tree-classifier test-data))

(format #t "depth: ~a\n" (tree-depth test-tree))
(format #t "width: ~a\n" (tree-width test-tree))
(tree->c test-tree
	 #:pre-hook
	 (lambda (tree)
	   (let ((str-port (open-output-string)))
	     (dataset->delimited
	      (tree-data tree) str-port)
	     (get-output-string str-port))))

(test-end "decision-tree-test")


;;; Local Variables:
;;; fill-column: 52
;;; End: