wordstem.pl 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. # This program is free software; you can redistribute it and/or modify
  2. # it under the terms of the GNU General Public License as published by
  3. # the Free Software Foundation; either version 2 of the License, or
  4. # (at your option) any later version.
  5. #
  6. # This program is distributed in the hope that it will be useful,
  7. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. # GNU General Public License for more details.
  10. #
  11. # You should have received a copy of the GNU General Public License
  12. # along with this program; if not, write to the
  13. # Free Software Foundation, Inc.
  14. # 59 Temple Place, Suite 330
  15. # Boston, MA 02111-1307 USA
  16. #
  17. # Porter stemming algorithm code copied verbatim from http://www.tartarus.org/~martin/PorterStemmer/
  18. use strict;
  19. use v5.10;
  20. AddModuleDescription('wordstem.pl', 'WordStemming');
  21. *OldStemmingResolveId = \&ResolveId;
  22. *ResolveId = \&NewStemmingResolveId;
  23. initialise();
  24. my %StemmedPages = ();
  25. sub NewStemmingResolveId {
  26. my $id = shift;
  27. my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
  28. return ($class, $resolved, $title, $exists) if $resolved;
  29. if (not %StemmedPages) {
  30. foreach my $page (AllPagesList()) {
  31. $StemmedPages{&stemWord($page)} = $page;
  32. }
  33. }
  34. my $page = &stemWord($id);
  35. if ($StemmedPages{$page}) {
  36. return ('local stemmed', $StemmedPages{$page}, $StemmedPages{$page}, undef);
  37. }
  38. }
  39. my %step2list;
  40. my %step3list;
  41. my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
  42. sub stem
  43. { my ($stem, $suffix, $firstch);
  44. my $w = shift;
  45. if (length($w) < 3) { return $w; } # length at least 3
  46. # now map initial y to Y so that the patterns never treat it as vowel:
  47. $w =~ /^./; $firstch = $&;
  48. if ($firstch =~ /^y/) { $w = ucfirst $w; }
  49. # Step 1a
  50. if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
  51. elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
  52. # Step 1b
  53. if ($w =~ /eed$/) { if ($` =~ /$mgr0/) { chop($w); } }
  54. elsif ($w =~ /(ed|ing)$/)
  55. { $stem = $`;
  56. if ($stem =~ /$_v/)
  57. { $w = $stem;
  58. if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
  59. elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
  60. elsif ($w =~ /^${C}${v}[^aeiouwxy]$/) { $w .= "e"; }
  61. }
  62. }
  63. # Step 1c
  64. if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/) { $w = $stem."i"; } }
  65. # Step 2
  66. if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
  67. { $stem = $`; $suffix = $1;
  68. if ($stem =~ /$mgr0/) { $w = $stem . $step2list{$suffix}; }
  69. }
  70. # Step 3
  71. if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
  72. { $stem = $`; $suffix = $1;
  73. if ($stem =~ /$mgr0/) { $w = $stem . $step3list{$suffix}; }
  74. }
  75. # Step 4
  76. if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
  77. { $stem = $`; if ($stem =~ /$mgr1/) { $w = $stem; } }
  78. elsif ($w =~ /(s|t)(ion)$/)
  79. { $stem = $` . $1; if ($stem =~ /$mgr1/) { $w = $stem; } }
  80. # Step 5
  81. if ($w =~ /e$/)
  82. { $stem = $`;
  83. if ($stem =~ /$mgr1/ or
  84. ($stem =~ /$meq1/ and not $stem =~ /^${C}${v}[^aeiouwxy]$/))
  85. { $w = $stem; }
  86. }
  87. if ($w =~ /ll$/ and $w =~ /$mgr1/) { chop($w); }
  88. # and turn initial Y back to y
  89. if ($firstch =~ /^y/) { $w = lcfirst $w; }
  90. return $w;
  91. }
  92. sub initialise {
  93. %step2list =
  94. ( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
  95. 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
  96. 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
  97. 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');
  98. %step3list =
  99. ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');
  100. $c = "[^aeiou]"; # consonant
  101. $v = "[aeiouy]"; # vowel
  102. $C = "${c}[^aeiouy]*"; # consonant sequence
  103. $V = "${v}[aeiou]*"; # vowel sequence
  104. $mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
  105. $meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
  106. $mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
  107. $_v = "^(${C})?${v}"; # vowel in stem
  108. }
  109. sub stemWord {
  110. my $page = shift;
  111. my $oldpage = $page;
  112. $page = "";
  113. # Split the word up at case changes and stem each subword
  114. my @words = split(/([a-z]*)([A-Z]+[a-z]+)/,$oldpage);
  115. foreach my $w(@words) {
  116. if ($w) {
  117. if ($w =~ /_/) { # Possible word separated by _
  118. my @subwords = split(/_/,$w);
  119. foreach my $w(@subwords) {
  120. if ($w) {
  121. $page .= lc(&stem($w)); #Force case changes to not matter
  122. }
  123. }
  124. }
  125. else{
  126. $page .= lc(&stem($w));
  127. }
  128. }
  129. }
  130. return $page;
  131. }