decision-trees.scm 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
  1. (define-module (data-mining classification decision-trees)
  2. #:use-module (data-mining dataset)
  3. #:use-module (data-mining attributes)
  4. #:use-module (srfi srfi-1)
  5. #:use-module (srfi srfi-9) ;define-record-type
  6. #:use-module (data-mining util) ;(list-split substitute-bindings successive-apply)
  7. #:use-module (data-mining type-conversions) ;(as-string)
  8. #:use-module (srfi srfi-26)
  9. #:use-module (ice-9 format)
  10. #:use-module (ice-9 match)
  11. #:use-module (ice-9 receive)
  12. #:use-module (ice-9 threads) ;(par-map current-processor-count)
  13. #:use-module (ice-9 regex) ;(regexp-substitute/global)
  14. #:export (make-tree
  15. tree->c
  16. tree-data
  17. tree-value
  18. tree-children
  19. tree-depth
  20. tree-width
  21. leaf?
  22. internal?
  23. tree-apply
  24. classify
  25. leaf-values
  26. true?
  27. false?
  28. condition->c
  29. predicate->c
  30. predicate->record-filter
  31. dataset-label
  32. label-counts
  33. impurity
  34. gain
  35. stop-induction?
  36. cost-by-data
  37. induce-decision-tree-classifier
  38. default-measure
  39. default-min-dataset-size
  40. prefixing-port))
  41. (define author "Eric Bavier <bavier@member.fsf.org")
  42. (define date "Thu Apr 4 16:56:57 CDT 2013 ")
  43. (define copyright "GPLv3+")
  44. ;; =================================================
  45. ;; Condition class
  46. ;; =================================================
  47. ;;
  48. ;; Each branch of a decision tree will have an
  49. ;; associated condition which if true directs flow
  50. ;; down that branch. To determine the truth of a
  51. ;; <cond> the procedure PREDICATE is applied to
  52. ;; ARGS.
  53. ;;
  54. ;; Note: The reason we define this class instead of
  55. ;; just having each branch hold onto a procedure is
  56. ;; for output purposes. It's easier to determine
  57. ;; how to map a condition to a C statement if we
  58. ;; have direct access to the predicate.
  59. ;; (define-class <cond> ()
  60. ;; (predicate #:init-value (const #t)
  61. ;; #:init-keyword #:predicate)
  62. ;; (args #:init-value '()
  63. ;; #:init-keyword #:args))
  64. ;; (define-class <else> (<cond>))
  65. ;; Evaluate the <cond> C for some set of value
  66. ;; bindings established via the alist ENVIRON. If
  67. ;; any of the condition arguments is a symbol which
  68. ;; is in the set of keys for ENVIRON, then the
  69. ;; corresponding alist value is substituted before
  70. ;; evaluation. Symbols which do not contain an
  71. ;; associated value are left as-is for the
  72. ;; evaluation.
  73. ;; (define-method (true? (c <cond>) environ)
  74. ;; (apply (slot-ref c 'predicate)
  75. ;; (substitute-bindings (slot-ref c 'args)
  76. ;; environ)))
  77. ;; (define-method (true? (e <else>) environ) #t)
  78. ;; (define-method (false? (c <cond>) environ)
  79. ;; (not (true? c environ)))
  80. ;;; XXX: These simplified condition/predicate tests
  81. ;;; should suffice for us. No need to define extra
  82. ;;; classes.
  83. (define (true? pred environ)
  84. (define (quote-symbols lst)
  85. (recursive-map (lambda (x)
  86. (if (symbol? x) `(quote ,x) x))
  87. lst))
  88. (eval (quote-symbols
  89. (substitute-bindings pred environ))
  90. (interaction-environment)))
  91. (define false? (negate true?))
  92. ;;; C should be a symbol representing a scheme
  93. ;;; condition (e.g. <, >, string=?, etc...)
  94. (define (condition->c c port)
  95. (if (symbol? c)
  96. (condition->c
  97. (eval c (interaction-environment))
  98. port)
  99. (cond
  100. ((eq? c < ) (format port "<" ))
  101. ((eq? c <=) (format port "<="))
  102. ((eq? c = ) (format port "=="))
  103. ((eq? c eq?) (format port "=="))
  104. ((eq? c >=) (format port ">="))
  105. ((eq? c > ) (format port ">" )))))
  106. ;; Note: this works currently for standard numerical
  107. ;; comparison's, but could be extended for string
  108. ;; comparison by looking for `string=?' predicate
  109. ;; (and have the induction procedure generate such
  110. ;; predicates)
  111. (define (predicate->c pred port)
  112. (define (ensure-string x)
  113. (if (symbol? x) (symbol->string x) x))
  114. (match pred
  115. ((condition left right)
  116. (format port
  117. "if (~a ~a ~a)"
  118. (ensure-string left)
  119. (condition->c condition #f)
  120. (ensure-string right)))
  121. ((true) ;XXX: are there other cases where there
  122. ;might be a single symbol?
  123. (format port "else"))))
  124. (define (predicate->record-filter g)
  125. (lambda (i vals) (true? g vals)))
  126. ;; =================================================
  127. ;; Data Utilities
  128. ;; =================================================
  129. ;; Determine, on the whole, what label the entire
  130. ;; dataset should belong to. Currently, the only
  131. ;; implemented method for doing this is to do a
  132. ;; majority vote. If the dataset is empty, this
  133. ;; procedure returns #f.
  134. (define (dataset-label ds)
  135. (let ((counts (label-counts ds)))
  136. (if (null? counts)
  137. #f
  138. (car (apply max* cdr counts)))))
  139. (define (label-counts ds)
  140. (let ((score-board (make-hash-table)))
  141. (for-each
  142. (lambda (l)
  143. (hash-set! score-board l
  144. (1+ (hash-ref score-board l 0))))
  145. (dataset-label-attribute-values ds))
  146. (hash-map->list cons score-board)))
  147. (define default-measure (make-parameter 'entropy))
  148. ;; Calculates the impurity of a dataset, using some
  149. ;; measure [entropy,gini]
  150. (define* (impurity ds #:optional (measure (default-measure)))
  151. (let ((l (dataset-length ds)))
  152. (if (> l 0)
  153. (let ((P (map (compose (cut / <> l) cdr) (label-counts ds))))
  154. (cond ((eq? measure 'entropy)
  155. (fold (lambda (p acc)
  156. (if (= p 0) acc
  157. (- acc (* p (log p)))))
  158. 0 P))
  159. ((eq? measure 'gini)
  160. (- 1 (fold (lambda (p acc) (+ acc (* p p))) 0 P)))))
  161. 0)))
  162. ;; =================================================
  163. ;; Decision-Tree Node
  164. ;; =================================================
  165. ;;
  166. ;; The value of internal nodes could be used for
  167. ;; example, in an autotuning context, to initialize
  168. ;; and/or cache split attribute values. To be
  169. ;; useful in such an application, internal values
  170. ;; should be written before the branches, but in
  171. ;; general the value could also be written following
  172. ;; the output of the branches. For this purpose, a
  173. ;; user could pass a value to keyword arguments
  174. ;; #:pre-hook and #:post-hook, which could be either
  175. ;; strings, ports, or procedures. If a procedure,
  176. ;; then of one argument which is the current node
  177. ;; being output.
  178. (define-record-type tree
  179. (make-tree*
  180. value ;Some scheme value of procedure.
  181. branches ;list of (pred . child) pairs
  182. data ;The dataset. For leaf nodes, this is the
  183. ;set of entries that made their way through
  184. ;the decision tree before induction stopped.
  185. ;For internal tree nodes, this is the data
  186. ;before being split to it's branches.
  187. )
  188. tree?
  189. (value tree-value* set-tree-value!)
  190. (branches tree-branches)
  191. (data tree-data))
  192. (define (tree-children tree)
  193. (map cdr (tree-branches tree)))
  194. (define* (make-tree #:key
  195. (value #f)
  196. (branches '())
  197. (data #f))
  198. (make-tree* value branches data))
  199. ;;; Return the value associated with the decision
  200. ;;; tree node DT. Uses a cached result if
  201. ;;; available, otherwise, calculates the node value
  202. ;;; based on the node's associated data.
  203. (define* (tree-value tree #:optional (dflt #f))
  204. (or (tree-value* tree)
  205. ;; else set and return the value based on the
  206. ;; dataset label
  207. (let ((label (or (dataset-label (tree-data tree))
  208. (if (procedure? dflt)
  209. (dflt tree)
  210. dflt))))
  211. (set-tree-value! tree label)
  212. label)))
  213. ;; Compute the information gain at this node
  214. ;;
  215. ;; Gain = I_p - sum( w_c * I_c )
  216. ;;
  217. ;; where w is the weight of the child, which is the
  218. ;; ratio of records at child c to the number of
  219. ;; records at the parent node, I_p is the impurity
  220. ;; if the given node, and I_c is the impurity of
  221. ;; child node c.
  222. (define* (data-gain parent children)
  223. (let ((parent-size (dataset-length parent))
  224. (child-sizes (map dataset-length children))
  225. (parent-impurity (impurity parent))
  226. (child-impurities (map impurity children)))
  227. (- parent-impurity
  228. (fold (lambda (s w acc)
  229. (+ acc (* (/ w parent-size) s)))
  230. 0
  231. child-impurities
  232. child-sizes))))
  233. ;;; Returns the information gain from going from
  234. ;;; this node to its children. If DT is a leaf
  235. ;;; node, then this procedure simply returns the
  236. ;;; impurity if DT.
  237. (define* (node-gain tree)
  238. (data-gain (tree-data tree)
  239. (map tree-data
  240. (tree-children tree))))
  241. ;;; Return the maximum depth of the decision tree
  242. ;;; TREE.
  243. (define (tree-depth tree)
  244. (1+ (fold max 0 (map tree-depth
  245. (tree-children tree)))))
  246. ;; The number of leaf nodes in a decision-tree
  247. (define (tree-width tree)
  248. (if (leaf? tree)
  249. 1
  250. (fold + 0 (map tree-width
  251. (tree-children tree)))))
  252. ;; Is this tree node a leaf node?
  253. (define (leaf? tree)
  254. (null? (tree-branches tree)))
  255. ;; Is this tree node an internal node?
  256. (define internal? (negate leaf?))
  257. ;; TODO: define alist-style methods for decision
  258. ;; trees which return the value had from following
  259. ;; the decisions and branches through the tree to a
  260. ;; branch
  261. (define (tree-apply tree environ)
  262. (if (leaf? tree)
  263. ;; If we're at a leaf, the application simply
  264. ;; returns the node's value
  265. (tree-value tree)
  266. ;; Otherwise, we send the request to a child tree.
  267. (tree-apply (direct tree environ) environ)))
  268. (define classify tree-apply)
  269. ;; Return a list of the scheme values at leaf nodes.
  270. (define-public (leaf-values tree)
  271. (if (leaf? tree)
  272. (list (tree-value tree))
  273. (concatenate (map leaf-values
  274. (tree-children tree)))))
  275. ;; Direct flow from this tree node down one of its
  276. ;; branches to another tree node by returning the
  277. ;; first of TREE's children for which the guard
  278. ;; condition is true
  279. (define (direct tree environ)
  280. (find (compose (cut true? <> environ) car)
  281. (tree-branches tree)))
  282. ;; Get the decision-tree node resulting from
  283. ;; following the branches. With a single argument
  284. ;; this procedure returns the input dt node. With
  285. ;; optional arguments, return the node from
  286. ;; following the indexed children branches. For
  287. ;; example::
  288. ;;
  289. ;; (branch test-tree 1 2)
  290. ;;
  291. ;; Gets the second child node of the first child
  292. ;; node of test-tree
  293. (define (branch tree . rest)
  294. (if (null? rest) tree
  295. (let ((child (cdr (list-ref (tree-branches tree)
  296. (car rest)))))
  297. (apply branch child (cdr rest)))))
  298. ;;; ================================================
  299. ;;; Tree Splits
  300. ;;; ================================================
  301. (define-record-type split
  302. (make-split*
  303. score
  304. predicates
  305. datasets)
  306. split?
  307. (score split-score)
  308. (predicates split-predicates)
  309. (datasets split-datasets))
  310. ;;; Create a split by applying DISSECTOR to DATASET.
  311. ;;; The attribute that DISSECTOR compares against is
  312. ;;; removed from sub-datasets if dissector has a
  313. ;;; length greater than 2. The thought is that if
  314. ;;; DISSECTOR is a multi-way split, then there is no
  315. ;;; sense splitting on it later.
  316. (define (make-split dataset dissector)
  317. (let* ((filter-preds (map predicate->record-filter
  318. dissector))
  319. (remove-attr? (> (length dissector) 2))
  320. (subdata (dataset-partition-records
  321. filter-preds dataset)))
  322. (make-split* (data-gain dataset subdata)
  323. dissector
  324. (if remove-attr?
  325. ;; XXX: assume attribute name
  326. ;; is always the second symbol
  327. ;; in a predicate.
  328. (let ((new-cols
  329. (delete
  330. (cadar dissector)
  331. (dataset-attribute-indices
  332. dataset))))
  333. (map (cut make-dataset/shared
  334. <> #:columns new-cols)
  335. subdata))
  336. subdata))))
  337. ;;; If no good splits are found, returns #f.
  338. ;;; Otherwise returns a split.
  339. ;;;
  340. ;;; The numbers of attributes in each DATA may be
  341. ;;; different from DS, depending on what split was
  342. ;;; performed (e.g. a multi-way split may mean that
  343. ;;; the attribute is no longer suitable for future
  344. ;;; splitting, so is simply removed).
  345. (define (best-split dataset)
  346. (let ((candidate-dissectors
  347. (concatenate (map (lambda (attr-idx)
  348. (attribute-dissectors
  349. (dataset-attribute dataset
  350. attr-idx)
  351. (dataset-attribute-values
  352. dataset attr-idx)))
  353. (dataset-attribute-indices
  354. dataset #:with-label #f)))))
  355. ;; TODO: We need to evaluate the extremum in
  356. ;; parallel. Bonus points if we could get
  357. ;; candidate-dissectors to be generated lazily.
  358. ;; (format #t "Have ~a candidate dissectors for dataset of length ~a\n"
  359. ;; (length candidate-dissectors)
  360. ;; (dataset-length dataset))
  361. (and (not (null? candidate-dissectors))
  362. (receive (dissector split)
  363. (apply par-extremum+value*
  364. (cut make-split dataset <>)
  365. (lambda (s1 s2) (> (split-score s1)
  366. (split-score s2)))
  367. candidate-dissectors)
  368. split))))
  369. ;;; ==================================================
  370. ;;; Decision Tree Induction (the hard stuff)
  371. ;;; ==================================================
  372. (define-public (pure-data? ds)
  373. (= (length (delete-duplicates
  374. (dataset-label-attribute-values ds)))
  375. 1))
  376. (define-public default-min-dataset-size (make-parameter 2))
  377. (define-public (small-data? ds)
  378. (< (dataset-length ds) (default-min-dataset-size)))
  379. ;;; Return #t if there are no more attributes to
  380. ;;; split on, or if all records have the same
  381. ;;; attribute values
  382. (define-public (no-split-attrs? ds)
  383. (define (zero? n) (= n 0))
  384. (or (dataset-empty? ds)
  385. ;; Stops on the first attribute that has
  386. ;; non-unique values. So, would most of the
  387. ;; time only scan one attribute.
  388. (every (lambda (attr-idx)
  389. ((compose zero?
  390. length
  391. delete-duplicates
  392. (cut dataset-attribute-values
  393. ds <>))
  394. attr-idx))
  395. (dataset-attribute-indices ds))))
  396. ;;; Apply some standard tests to data, as well as
  397. ;;; any predicates in PREDS, which should be
  398. ;;; procedures of arity 1: taking a dataset and
  399. ;;; returning non-#f if induction should stop.
  400. (define (stop-induction? data . preds)
  401. (find (cut <> data)
  402. (append (list pure-data?
  403. small-data?
  404. no-split-attrs?)
  405. preds)))
  406. ;; Return a procedure which, when given a dataset,
  407. ;; returns #t if it is determined that this data is
  408. ;; "cheap" to leave as-is without further splitting,
  409. ;; as determined by the following:
  410. ;;
  411. ;; 1) determine the current label, L, of the data, D
  412. ;;
  413. ;; 2) determine the "cost" of assigning each record,
  414. ;; r, in D to class L, by looking up (r, L) in
  415. ;; COST-DATA
  416. ;;
  417. ;; 3) If the maximum value of all such costs is
  418. ;; greater than COST-THRESHOLD, then return #f.
  419. ;;
  420. ;; A #t result can be interpreted as meaning that it
  421. ;; is not costly to assign the current label to D,
  422. ;; even if the data is otherwise relatively "impure".
  423. ;;
  424. ;; The 'case-weight' argument can take a few forms:
  425. ;;
  426. ;; - If a symbol, interpreted as the tag of an
  427. ;; attribute in COST-DATA which contains weight
  428. ;; numbers for each record
  429. ;;
  430. ;; - If a procedure, it should return a weight
  431. ;; number when called with an record index.
  432. ;;
  433. (define* (cost-by-data cost-data
  434. #:key
  435. (cost-threshold 0.05)
  436. (case-weight (const 1.0)))
  437. (let ((weight (if (procedure? case-weight)
  438. case-weight
  439. ;; else symbol
  440. (lambda (i)
  441. (dataset-ref cost-data i case-weight)))))
  442. (lambda (data)
  443. (let* ((label (dataset-label data))
  444. (max-cost
  445. (apply max
  446. (map (lambda (r)
  447. (let ((cost (dataset-ref cost-data
  448. r label)))
  449. (* (weight r) cost)))
  450. (dataset-record-indices data)))))
  451. (< max-cost cost-threshold)))))
  452. ;;; Induce a decision tree classifier from DATA.
  453. ;;;
  454. ;;; The stop? keyword argument should be a procedure
  455. ;;; of arity 1 which accepts a dataset, and returns
  456. ;;; #t if induction should stop for the given data
  457. ;;; (i.e. the data forms a leaf node).
  458. (define* (induce-decision-tree-classifier
  459. data
  460. #:key
  461. (stop? stop-induction?)
  462. (default-class 'null)
  463. (tree-value (cut tree-value <> default-class)))
  464. (if (stop? data)
  465. (make-tree #:data data)
  466. (let ((next-split (best-split data)))
  467. (if (or (eq? next-split #f)
  468. (= (split-score next-split) 0))
  469. (make-tree #:data data)
  470. (let* ((branches
  471. ;; TODO: This could most likely be
  472. ;; parallelized.
  473. (map (lambda (pred data)
  474. (let* ((split-tree
  475. (induce-decision-tree-classifier
  476. data
  477. #:stop? stop?
  478. #:default-class default-class
  479. #:tree-value tree-value)))
  480. (cons pred split-tree)))
  481. (split-predicates next-split)
  482. (split-datasets next-split)))
  483. (branch-trees (map cdr branches)))
  484. (if (and
  485. (every leaf? branch-trees)
  486. (= (length (delete-duplicates
  487. (map tree-value branch-trees)))
  488. 1))
  489. ;; No reason to split of each
  490. ;; branch has the same value.
  491. (make-tree #:data data)
  492. (make-tree
  493. #:data data
  494. #:branches branches)))))))
  495. ;;; ==================================================
  496. ;;; Decision Tree Output
  497. ;;; ==================================================
  498. ;;; Create a port which prefixes every line printed
  499. ;;; to PORT with the string PRE.
  500. (define (prefixing-port port pre)
  501. (let ((at-newline #t))
  502. (make-soft-port
  503. (vector
  504. (lambda (c)
  505. (if (eq? c #\newline)
  506. (begin
  507. (write-char c port)
  508. (set! at-newline #t))
  509. (begin
  510. (when at-newline
  511. (display pre port)
  512. (set! at-newline #f))
  513. (write-char c port))))
  514. (lambda (s)
  515. (begin
  516. (when at-newline
  517. (display pre port)
  518. (set! at-newline #f))
  519. ;; Only prefix newlines that are followed by some
  520. ;; other characters on that line.
  521. (regexp-substitute/global port "(\n)([^\n]+)" s
  522. 'pre 1 pre 2 'post)
  523. (if (string=? (string-take-right s 1) "\n")
  524. (set! at-newline #t))))
  525. (lambda () (force-output port))
  526. #f
  527. (lambda () (close-port port))
  528. #f)
  529. "w")))
  530. (define (comment->c val)
  531. (let ((sv (as-string val)))
  532. (unless (string-null? sv)
  533. (format #f "/* ~a */" sv))))
  534. (define* (tree->c tree
  535. #:optional
  536. (out-port (current-output-port))
  537. #:key
  538. (pre-hook (compose comment->c tree-value))
  539. (leaf-hook (compose as-string tree-value))
  540. (post-hook (const "")))
  541. (let ((port (cond ((port? out-port) out-port)
  542. ((eq? out-port #t) (current-output-port))
  543. ((eq? out-port #f) (open-output-string))))
  544. (pre (pre-hook tree))
  545. (post (post-hook tree)))
  546. (begin
  547. (unless (or (eq? pre #f) (string-null? pre))
  548. (format port "~a" pre))
  549. (when (leaf? tree)
  550. (format port "~a" (leaf-hook tree)))
  551. (for-each
  552. (lambda (elem)
  553. (begin
  554. (predicate->c (car elem) port)
  555. (format port " {\n")
  556. (tree->c (cdr elem)
  557. (prefixing-port port " ")
  558. #:pre-hook pre-hook
  559. #:leaf-hook leaf-hook
  560. #:post-hook post-hook)
  561. (format port "} ")))
  562. (tree-branches tree))
  563. (format port "\n")
  564. (unless (string-null? post)
  565. (format port "~a\n" post))
  566. (when (eq? out-port #f)
  567. (get-output-string port)))))
  568. ;;; ================================================
  569. ;;; Tests
  570. ;;; ================================================
  571. (use-modules (srfi srfi-64)
  572. (data-mining test-util))
  573. (test-begin "decision-tree-test")
  574. ;;; Check predicates
  575. (test-assert (true? `(,< 5 10) '()))
  576. (test-assert (false? `(,< 10 5) '()))
  577. (test-assert (true? `(,< a 10) '((a . 5))))
  578. (test-assert (false? `(,< a 10) '((a . 12))))
  579. ;;; Need some test data for the following tests
  580. (define test-data
  581. (let ((input "
  582. rec,foo,bar,class
  583. 0,0.1,t,a
  584. 1,0.12,t,a
  585. 2,0.09,n,a
  586. 3,0.21,n,b
  587. 4,0.18,t,b
  588. 5,0.11,n,a
  589. 6,0.121,n,a
  590. 7,0.23,t,b
  591. 8,0.04,t,c")
  592. (attributes `(,(make-nominal-attribute)
  593. ,(make-numeric-attribute)
  594. ,(make-nominal-attribute
  595. #:read-value string->symbol)
  596. ,(make-nominal-attribute))))
  597. (delimited->dataset attributes
  598. 'class
  599. (open-input-string input)
  600. #:rec-idx 'rec)))
  601. (test-eq 2 (dataset-width test-data))
  602. (test-assert "test data columns"
  603. ((list-permutation? '(foo bar class))
  604. (dataset-attribute-indices test-data)))
  605. ;;; Check predicate->record-filter
  606. (define filter (predicate->record-filter `(,< foo 0.12)))
  607. ;;; Use that filter and make sure we get the correct
  608. ;;; number of records.
  609. (define ds/f (dataset-filter test-data
  610. #:record-pred filter))
  611. (test-eq "filtered length" 4 (dataset-length ds/f))
  612. (test-assert "filtered indices"
  613. ((list-permutation? (map string->symbol
  614. '("0" "2" "5" "8")))
  615. (dataset-record-indices ds/f)))
  616. ;;; Check dataset-label
  617. (test-equal "dataset label" "a" (dataset-label test-data))
  618. ;;; Check label-counts
  619. (test-assert "label-counts"
  620. (same-map? '(("a" . 5) ("b" . 3) ("c" . 1))
  621. (label-counts test-data)))
  622. ;;; Check induce-decision-tree-classifier
  623. (define test-tree
  624. (induce-decision-tree-classifier test-data))
  625. (format #t "depth: ~a\n" (tree-depth test-tree))
  626. (format #t "width: ~a\n" (tree-width test-tree))
  627. (tree->c test-tree
  628. #:pre-hook
  629. (lambda (tree)
  630. (let ((str-port (open-output-string)))
  631. (dataset->delimited
  632. (tree-data tree) str-port)
  633. (get-output-string str-port))))
  634. (test-end "decision-tree-test")
  635. ;;; Local Variables:
  636. ;;; fill-column: 52
  637. ;;; End: