ban-mixed-scripts.pl 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. # Copyright (C) 2018 Alex Schroeder <alex@gnu.org>
  2. # This program is free software: you can redistribute it and/or modify it under
  3. # the terms of the GNU General Public License as published by the Free Software
  4. # Foundation, either version 3 of the License, or (at your option) any later
  5. # version.
  6. #
  7. # This program is distributed in the hope that it will be useful, but WITHOUT
  8. # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  9. # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  10. #
  11. # You should have received a copy of the GNU General Public License along with
  12. # this program. If not, see <http://www.gnu.org/licenses/>.
  13. =encoding utf8
  14. =head1 Mixed Scripts
  15. This module disallows ordinary users from posting words that consist of multiple
  16. scripts. Stuff like this: "It's diffіcult to find knowledgeable people on this
  17. topic, but youu sound like you know wgat you're taⅼkіng аboսt!" Did you notice
  18. the confusable characters? The sentence contains the following:
  19. ARMENIAN SMALL LETTER SEH
  20. CYRILLIC SMALL LETTER A
  21. CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
  22. SMALL ROMAN NUMERAL FIFTY
  23. =cut
  24. use strict;
  25. use v5.10;
  26. use Unicode::UCD qw(charprop);
  27. AddModuleDescription('ban-mixed-scripts.pl', 'Ban Mixed Scripts Extension');
  28. *OldBanMixedScriptsBannedContent = \&BannedContent;
  29. *BannedContent = \&NewBanMixedScriptsBannedContent;
  30. sub NewBanMixedScriptsBannedContent {
  31. my $rule = OldBanMixedScriptsBannedContent(@_);
  32. $rule ||= BanMixedScript(@_);
  33. return $rule;
  34. }
  35. sub BanMixedScript {
  36. my $str = shift;
  37. my @words = $str =~ m/\w+/g;
  38. my %seen;
  39. my %prop;
  40. for my $word (@words) {
  41. next if $seen{$word};
  42. $seen{$word} = 1;
  43. my $script;
  44. for my $char (split(//, $word)) {
  45. my $s = $prop{$char};
  46. if (not $s) {
  47. $s = charprop(ord($char), "Script_Extensions");
  48. if ($s eq 'Hiragana') {
  49. $s = 'Han'; # this mixing is ok
  50. }
  51. $prop{$char} = $s;
  52. }
  53. next if $s eq "Common";
  54. if (not $script) {
  55. $script = $s;
  56. } elsif ($script ne $s) {
  57. return "Mixed scripts in $word ($script and $s, if not more)";
  58. }
  59. }
  60. }
  61. }