decision-tree-test.scm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. (use-modules
  2. ;; SRFI 64 for unit testing facilities
  3. (srfi srfi-64)
  4. ;; SRFI 8 for `receive` form
  5. (srfi srfi-8)
  6. ;; utils - the code to be tested
  7. (decision-tree)
  8. ;; Utilities for testing
  9. (utils test)
  10. ;; Dependencies for testing the code to be tested
  11. (dataset)
  12. (metrics)
  13. (pruning)
  14. (prediction)
  15. (data-point)
  16. (tree))
  17. (define TEST-DATA
  18. (list #(2.771244718 1.784783929 0)
  19. #(1.728571309 1.169761413 0)
  20. #(3.678319846 2.81281357 0)
  21. #(3.961043357 2.61995032 0)
  22. #(2.999208922 2.209014212 0)
  23. #(7.497545867 3.162953546 1)
  24. #(9.00220326 3.339047188 1)
  25. #(7.444542326 0.476683375 1)
  26. #(10.12493903 3.234550982 1)
  27. #(6.642287351 3.319983761 1)))
  28. (define PRECISION (expt 10 -9))
  29. (test-begin "decision-tree-test")
  30. (test-group
  31. "split-data"
  32. ;; split-data does not split correctly
  33. (test-equal "split-data-1"
  34. (list (list #(1.0 1.0 1.0 1.0 0)
  35. #(1.2 1.0 1.0 1.0 0)
  36. #(1.4 1.0 1.0 1.0 0))
  37. (list #(1.6 1.0 1.0 1.0 0)
  38. #(1.8 1.0 1.0 1.0 0)
  39. #(2.0 1.0 1.0 1.0 0)))
  40. (split-data (list #(1.0 1.0 1.0 1.0 0)
  41. #(1.2 1.0 1.0 1.0 0)
  42. #(1.4 1.0 1.0 1.0 0)
  43. #(1.6 1.0 1.0 1.0 0)
  44. #(1.8 1.0 1.0 1.0 0)
  45. #(2.0 1.0 1.0 1.0 0))
  46. 0
  47. 1.5))
  48. ;; "split-data does not split correctly"
  49. (test-equal "split-data-2"
  50. (list (list #(1.0 1.0 1.0 1.0 0)
  51. #(1.4 1.0 1.0 1.0 0)
  52. #(1.8 1.0 1.0 1.0 0)
  53. #(2.0 2.0 1.0 1.0 0))
  54. (list #(1.2 4.0 1.0 1.0 0)
  55. #(1.6 3.0 1.0 1.0 0)))
  56. (split-data (list #(1.0 1.0 1.0 1.0 0)
  57. #(1.2 4.0 1.0 1.0 0)
  58. #(1.4 1.0 1.0 1.0 0)
  59. #(1.6 3.0 1.0 1.0 0)
  60. #(1.8 1.0 1.0 1.0 0)
  61. #(2.0 2.0 1.0 1.0 0))
  62. 1
  63. 2.5)))
  64. (test-group
  65. "get-best-split"
  66. (let ([test-data (list #(2.771244718 1.784783929 0)
  67. #(1.728571309 1.169761413 0)
  68. #(3.678319846 2.81281357 0)
  69. #(3.961043357 2.61995032 0)
  70. #(2.999208922 2.209014212 0)
  71. #(7.497545867 3.162953546 1)
  72. #(9.00220326 3.339047188 1)
  73. #(7.444542326 0.476683375 1)
  74. #(10.12493903 3.234550982 1)
  75. #(6.642287351 3.319983761 1))]
  76. [feature-column-indices (list 0 1)]
  77. [label-column-index 2])
  78. ;; get-best-split does not give the best split
  79. (test-equal "get-best-split-1"
  80. ;; In the left branch the values of the first feature are all lower than
  81. ;; the values of the the first feature in the right branch.
  82. ;; In the right branch there is a value for the second feature, which is
  83. ;; lower than the values for that feature in the left branch, but all other
  84. ;; values of the feature in the right branch are higher than the ones in
  85. ;; the left branch, which makes the second feature an imperfect split
  86. ;; feature.
  87. ;; This means, that the best split is the one on the first feature.
  88. (make-split 0
  89. 6.642287351
  90. (list
  91. ;; left branch data
  92. (list #(2.771244718 1.784783929 0)
  93. #(1.728571309 1.169761413 0)
  94. #(3.678319846 2.81281357 0)
  95. #(3.961043357 2.61995032 0)
  96. #(2.999208922 2.209014212 0))
  97. ;; right branch data
  98. (list #(7.497545867 3.162953546 1)
  99. #(9.00220326 3.339047188 1)
  100. #(7.444542326 0.476683375 1)
  101. #(10.12493903 3.234550982 1)
  102. #(6.642287351 3.319983761 1)))
  103. 0.0)
  104. (get-best-split test-data
  105. feature-column-indices
  106. label-column-index))))
  107. (test-group
  108. "fit"
  109. (let ([test-data (list #(1.0 1.0 0)
  110. #(1.2 1.0 0)
  111. #(1.1 1.0 0)
  112. #(1.4 1.0 0)
  113. #(1.2 1.0 0)
  114. #(1.2 1.0 0) ;;
  115. #(2.3 1.0 1)
  116. #(2.0 1.0 1)
  117. #(2.3 1.0 1)
  118. #(2.0 1.0 1)
  119. #(2.3 1.0 1)
  120. #(2.0 1.0 1)
  121. #(2.4 1.0 1))]
  122. [feature-column-indices (list 0 1)]
  123. [label-column-index 2])
  124. (test-equal
  125. (let ([best-split (get-best-split test-data (list 0 1) 2)])
  126. (make-node test-data
  127. (split-feature-index best-split)
  128. (split-value best-split)
  129. (make-leaf-node (list #(1.0 1.0 0)
  130. #(1.2 1.0 0)
  131. #(1.1 1.0 0)
  132. #(1.4 1.0 0)
  133. #(1.2 1.0 0)
  134. #(1.2 1.0 0)))
  135. (make-leaf-node (list #(2.3 1.0 1)
  136. #(2.0 1.0 1)
  137. #(2.3 1.0 1)
  138. #(2.0 1.0 1)
  139. #(2.3 1.0 1)
  140. #(2.0 1.0 1)
  141. #(2.4 1.0 1)))))
  142. (fit #:train-data test-data
  143. #:feature-column-indices (list 0 1)
  144. #:label-column-index 2
  145. #:max-depth 2
  146. #:min-data-points 4
  147. #:min-data-points-ratio 0.02)))
  148. (let* ([test-data (list #(1.0 1.0 0)
  149. #(1.2 1.0 0)
  150. #(1.1 1.0 0)
  151. #(1.4 1.0 0)
  152. #(1.2 1.0 0)
  153. #(1.2 1.0 0) ;;
  154. #(2.3 1.1 0)
  155. #(2.0 1.1 0)
  156. #(2.3 1.0 1)
  157. #(2.0 1.0 1)
  158. #(2.3 1.0 1)
  159. #(2.0 1.0 1)
  160. #(2.4 1.0 1))]
  161. [best-split (get-best-split test-data (list 0 1) 2)])
  162. (test-equal
  163. (make-node test-data
  164. (split-feature-index best-split)
  165. (split-value best-split)
  166. (make-leaf-node (list #(1.0 1.0 0)
  167. #(1.2 1.0 0)
  168. #(1.1 1.0 0)
  169. #(1.4 1.0 0)
  170. #(1.2 1.0 0)
  171. #(1.2 1.0 0)))
  172. (let* ([subset (list #(2.3 1.1 0)
  173. #(2.0 1.1 0)
  174. #(2.3 1.0 1)
  175. #(2.0 1.0 1)
  176. #(2.3 1.0 1)
  177. #(2.0 1.0 1)
  178. #(2.4 1.0 1))]
  179. [best-split (get-best-split subset (list 0 1) 2)])
  180. (make-node subset
  181. (split-feature-index best-split)
  182. (split-value best-split)
  183. (make-leaf-node (list #(2.3 1.0 1)
  184. #(2.0 1.0 1)
  185. #(2.3 1.0 1)
  186. #(2.0 1.0 1)
  187. #(2.4 1.0 1)))
  188. (make-leaf-node (list #(2.3 1.1 0)
  189. #(2.0 1.1 0))))))
  190. (fit #:train-data test-data
  191. #:feature-column-indices (list 0 1)
  192. #:label-column-index 2
  193. #:max-depth 3
  194. #:min-data-points 2
  195. #:min-data-points-ratio 0.02)))
  196. (let* ([test-data (list #(2.3 1.1 0)
  197. #(2.0 1.1 0)
  198. #(2.3 1.0 1)
  199. #(2.0 1.0 1)
  200. #(2.3 1.0 1)
  201. #(2.0 1.0 1)
  202. #(2.4 1.0 1))]
  203. [best-split (get-best-split test-data (list 0 1) 2)])
  204. (test-equal
  205. (make-node test-data
  206. (split-feature-index best-split)
  207. (split-value best-split)
  208. (make-leaf-node (list #(2.3 1.0 1)
  209. #(2.0 1.0 1)
  210. #(2.3 1.0 1)
  211. #(2.0 1.0 1)
  212. #(2.4 1.0 1)))
  213. (make-leaf-node (list #(2.3 1.1 0)
  214. #(2.0 1.1 0))))
  215. (fit #:train-data test-data
  216. #:feature-column-indices (list 0 1)
  217. #:label-column-index 2
  218. #:max-depth 3
  219. #:min-data-points 2
  220. #:min-data-points-ratio 0.02))))
  221. (test-group
  222. "column-uniform?"
  223. (test-assert "column-uniform? of empty column should be true"
  224. (column-uniform? empty-dataset =))
  225. (test-assert "column-uniform? of uniform column should result in true -- 1"
  226. (column-uniform? (list 1 1 1) =))
  227. (test-assert "column-uniform? of uniform column should result in true -- 2"
  228. (column-uniform?
  229. (dataset-get-col
  230. (list #(1.0 1.0 0)
  231. #(1.2 1.0 0)
  232. #(1.1 1.0 0)
  233. #(1.4 1.0 0)
  234. #(1.2 1.0 0)
  235. #(1.2 1.0 0))
  236. 2)
  237. =))
  238. (test-assert "column-uniform? of non-uniform column should result in false"
  239. (not
  240. (column-uniform? (list 1 2 3) =))))
  241. (test-group
  242. "dataset-partition"
  243. (test-equal "dataset-partition should split at given value of specified column"
  244. (list (list #(2.3 1.0 0)
  245. #(2.0 1.0 0)
  246. #(2.3 1.0 0)
  247. #(2.0 1.0 0)
  248. #(2.4 1.0 0))
  249. (list #(2.3 1.1 1)
  250. #(2.0 1.1 1)))
  251. (receive (matching not-matching)
  252. (dataset-partition (lambda (data-point)
  253. (= (data-point-get-col data-point 2) 0))
  254. (list #(2.3 1.1 1)
  255. #(2.0 1.1 1)
  256. #(2.3 1.0 0)
  257. #(2.0 1.0 0)
  258. #(2.3 1.0 0)
  259. #(2.0 1.0 0)
  260. #(2.4 1.0 0)))
  261. (list matching not-matching))))
  262. (test-group
  263. "cross-validation-split"
  264. (test-equal
  265. (list '(6 19 13 0 10)
  266. '(2 16 3 17 4)
  267. '(11 8 7 14 1)
  268. '(5 12 18 9 15))
  269. (cross-validation-split '(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19)
  270. 4
  271. #:random-seed 12345)))
  272. (test-group
  273. "leave-one-out-k-folds"
  274. (test-equal
  275. (list (list #(1 1)
  276. #(1 1)
  277. #(1 1)
  278. #(1 1))
  279. (list #(2 2)
  280. #(2 2)
  281. #(2 2)
  282. #(2 2))
  283. (list #(4 4)
  284. #(4 4)
  285. #(4 4)
  286. #(4 4)))
  287. (leave-one-out-k-folds (list (list #(1 1)
  288. #(1 1)
  289. #(1 1)
  290. #(1 1))
  291. (list #(2 2)
  292. #(2 2)
  293. #(2 2)
  294. #(2 2))
  295. (list #(3 3)
  296. #(3 3)
  297. #(3 3)
  298. #(3 3))
  299. (list #(4 4)
  300. #(4 4)
  301. #(4 4)
  302. #(4 4)))
  303. (list #(3 3)
  304. #(3 3)
  305. #(3 3)
  306. #(3 3)))))
  307. (test-group
  308. "select-min-cost-split"
  309. (test-equal "select-min-cost-split selects best of 3 splits"
  310. (make-split 2 9.78 '() 0.0)
  311. (select-min-cost-split (make-split 0 1.1 '() 2.0)
  312. (make-split 1 2.67 '() 1.0)
  313. (make-split 2 9.78 '() 0.0))))
  314. (test-group
  315. "evaluate-algorithm "
  316. (test-equal
  317. 4
  318. (length
  319. (evaluate-algorithm
  320. #:dataset TEST-DATA
  321. #:n-folds 4
  322. #:feature-column-indices (list 0 1)
  323. #:label-column-index 2
  324. #:max-depth 3
  325. #:min-data-points 4
  326. #:min-data-points-ratio 0.02
  327. #:min-impurity-split (expt 10 -7)
  328. #:stop-at-no-impurity-improvement #t
  329. #:random-seed 0)))
  330. ;; TODO: real test cose
  331. )
  332. (test-end "decision-tree-test")