split-quality-measure.scm 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. (define-module (split-quality-measure))
  2. (use-modules
  3. ;; SRFI-1 for additional list procedures
  4. (srfi srfi-1)
  5. (tree)
  6. (dataset)
  7. (data-point))
  (define-public calc-proportion
    (lambda (subset class-label label-column-index)
      "Return p * (1 - p), where p is the fraction of data points in SUBSET
  whose value in column LABEL-COLUMN-INDEX is numerically equal (=) to
  CLASS-LABEL.  Return 0 when SUBSET is empty."
      (if (dataset-empty? subset)
          ;; An empty subset contributes nothing; this also avoids a
          ;; division by zero below.
          0
          (let* ([total (dataset-length subset)]
                 ;; Predicate: does this data point carry the requested label?
                 [has-label?
                  (lambda (point)
                    (= (data-point-get-col point label-column-index)
                       class-label))]
                 ;; `count' comes from SRFI-1 and is applied to SUBSET directly.
                 [matching (count has-label? subset)]
                 [fraction (/ matching total)])
            (* fraction (- 1.0 fraction))))))
  23. ;; The procedure gini-index evaluates the quality of a split. It acts as a
  24. ;; cost function: the lower the value, the better the split, so we want to
  25. ;; keep the cost of each split low (a greedy choice). There are other ways of
  26. ;; calculating the quality of a split, but for now we implement the Gini index.
  (define-public gini-index
    ;; `lambda*' (core in Guile 2.0+) lets LABELS be optional, so existing
    ;; two-argument callers keep the binary-classification behaviour.
    (lambda* (subsets label-column-index #:optional (labels '(0 1)))
      "Calculate the gini index quality measure, based on the result of a split.

  SUBSETS is a list of datasets produced by a split.  LABEL-COLUMN-INDEX is
  the column index passed through to calc-proportion to read each label.
  LABELS is the list of class labels to consider; it defaults to (0 1), i.e.
  binary classification.  Labels are compared with `=' inside
  calc-proportion, so they must be numbers.

  The result is the sum, over all subsets and all labels, of the value
  returned by calc-proportion.  Subsets are summed as-is: they are not
  weighted by their size.  An empty SUBSETS list yields 0."
      (let ([subset-cost
             ;; Cost of one subset: sum of calc-proportion over every label.
             (lambda (subset)
               (apply +
                      (map (lambda (label)
                             (calc-proportion subset label label-column-index))
                           labels)))])
        ;; Total cost of the split: sum of the per-subset costs.
        (apply + (map subset-cost subsets)))))
  41. subsets))))