DirectionDetectorPlugin.php 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. <?php
  2. /**
  3. * DirectionDetector plugin, detects notices with RTL content & sets RTL
  4. * style for them.
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. *
  19. * @category Plugin
  20. * @package StatusNet
  21. * @author Behrooz shabani (everplays) - <behrooz@rock.com>
  22. * @copyright 2009-2010 Behrooz shabani
  23. * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
  24. *
  25. */
  // Abort immediately when this file is loaded in a context that has not
  // defined the STATUSNET constant.
  defined('STATUSNET') || exit(1);

  // Plugin version string; reported by onPluginVersion().
  define('DIRECTIONDETECTORPLUGIN_VERSION', '0.2.0');
  /**
   * Detects notices whose text is predominantly written in a right-to-left
   * script and wraps their rendered HTML in <span class="rtl">...</span>.
   */
  class DirectionDetectorPlugin extends Plugin
  {
      /**
       * Inclusive [first, last] code point ranges that count as right-to-left.
       *
       * NOTE(review): Tifinagh code points carry bidi class L in the Unicode
       * Character Database; the range is kept because the original list
       * included it - confirm whether that is intended.
       *
       * @var array
       */
      private static $rtlRanges = array(
          array(0x0590, 0x05FF), // Hebrew
          array(0x0600, 0x06FF), // Arabic (Persian, Urdu, Kurdish, ...)
          array(0x0700, 0x074F), // Syriac
          array(0x0750, 0x077F), // Arabic Supplement
          array(0x0780, 0x07BF), // Thaana
          array(0x07C0, 0x07FF), // NKo
          array(0x2D30, 0x2D7F), // Tifinagh
          array(0xFB1D, 0xFB4F), // Hebrew presentation forms (Latin/Armenian ligatures below U+FB1D excluded)
          array(0xFB50, 0xFDFF), // Arabic presentation forms A
          array(0xFE70, 0xFEFF), // Arabic presentation forms B
      );

      /**
       * SN plugin API hook: wraps the rendered notice in an RTL span when the
       * notice content is detected as right-to-left and is not wrapped yet.
       *
       * @param object $notice notice about to be saved; `content` is inspected
       *                       and `rendered` may be rewritten
       *
       * @return boolean always true
       */
      public function onStartNoticeSave($notice)
      {
          $alreadyWrapped = strpos((string) $notice->rendered, '<span class="rtl">') !== false;

          if (!$alreadyWrapped && self::isRTL($notice->content)) {
              $notice->rendered = '<span class="rtl">' . $notice->rendered . '</span>';
          }

          return true;
      }

      /**
       * Tells whether the passed text is right-to-left content.
       *
       * Every space-separated word of the cleaned text votes: +1 when it
       * starts with an RTL character, -1 otherwise. A positive total means
       * RTL, a negative total means LTR, and a tie is decided by the first
       * word.
       *
       * @param string $content raw notice text
       *
       * @return boolean
       */
      public static function isRTL($content)
      {
          $words = explode(' ', self::getClearText($content));

          $balance = 0;
          foreach ($words as $word) {
              $balance += self::startsWithRTLCharacter($word) ? 1 : -1;
          }

          if ($balance > 0) {
              return true;
          }
          if ($balance < 0) {
              return false;
          }

          // Equal number of RTL and LTR words: the first word decides.
          return self::startsWithRTLCharacter($words[0]);
      }

      /**
       * Checks whether the passed string starts with a character from one of
       * the RTL code point ranges.
       *
       * @param string $str
       *
       * @return boolean false for empty input, undecodable input, or when the
       *                 first character is not in a listed range
       */
      public static function startsWithRTLCharacter($str)
      {
          $str = (string) $str;
          if ($str === '') {
              return false;
          }

          $codePoints = self::utf8ToUnicode(mb_substr($str, 0, 1, 'utf-8'));

          // The decoder returns false on malformed input and an empty array
          // when the only character was a BOM; neither yields a code point.
          if (!is_array($codePoints) || !isset($codePoints[0])) {
              return false;
          }

          $cc = $codePoints[0];
          foreach (self::$rtlRanges as $range) {
              if ($cc >= $range[0] && $cc <= $range[1]) {
                  return true;
              }
          }

          return false;
      }

      /**
       * Clears text from replies, tags, groups, retweet/redent markers and
       * redundant whitespace.
       *
       * @param string $str
       *
       * @return string cleaned text; empty string when the input cannot be
       *                processed as UTF-8
       */
      private static function getClearText($str)
      {
          // reply, group, tag
          $str = preg_replace('/@[^ ]+|![^ ]+|#[^ ]+/u', '', (string) $str);

          if ($str !== null) {
              // redent, retweet
              $str = preg_replace('/^RT[: ]{1}| RT | RT: |^RD[: ]{1}| RD | RD: |[♺♻:]/u', '', $str);
          }

          // preg_replace() yields NULL when a /u pattern meets malformed UTF-8.
          if ($str === null) {
              return '';
          }

          // trim, then collapse runs of whitespace into a single space
          return preg_replace("/[ \r\t\n]+/", ' ', trim($str));
      }

      /**
       * Adds the plugin's jquery.DirectionDetector.js script to the page for
       * logged-in users (per the original author's note, it does the same
       * detection on the input textarea).
       *
       * @param Action $action
       *
       * @return boolean always true
       */
      public function onEndShowScripts($action)
      {
          if (common_logged_in()) {
              $action->script($this->path('jquery.DirectionDetector.js'));
          }

          return true;
      }

      /**
       * Takes an UTF-8 string and returns an array of ints representing the
       * Unicode characters. Astral planes are supported, i.e. the ints in the
       * output can be > 0xFFFF. Occurrences of the BOM are ignored.
       * Surrogates are not allowed.
       *
       * @param string $str
       *
       * @return mixed array of ints, or false on invalid input
       */
      private static function utf8ToUnicode($str)
      {
          $mState = 0; // expected number of octets after the current octet
                       // until the beginning of the next UTF-8 sequence
          $mUcs4  = 0; // code point being assembled
          $mBytes = 1; // expected number of octets in the current sequence
          $out    = array();
          $len    = strlen($str);

          for ($i = 0; $i < $len; $i++) {
              // Square-bracket offset: the curly-brace form ($str{$i}) is
              // rejected by PHP 8.
              $in = ord($str[$i]);

              if (0 == $mState) {
                  // When mState is zero we expect either a US-ASCII character
                  // or the first octet of a multi-octet sequence.
                  if (0 == (0x80 & $in)) {
                      // US-ASCII, pass straight through.
                      $out[]  = $in;
                      $mBytes = 1;
                  } elseif (0xC0 == (0xE0 & $in)) {
                      // First octet of 2 octet sequence
                      $mUcs4  = ($in & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;
                  } elseif (0xE0 == (0xF0 & $in)) {
                      // First octet of 3 octet sequence
                      $mUcs4  = ($in & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;
                  } elseif (0xF0 == (0xF8 & $in)) {
                      // First octet of 4 octet sequence
                      $mUcs4  = ($in & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;
                  } elseif (0xF8 == (0xFC & $in)) {
                      /* First octet of 5 octet sequence.
                       *
                       * This is illegal because the encoded codepoint must be
                       * either (a) not the shortest form or (b) outside the
                       * Unicode range of 0-0x10FFFF. Rather than trying to
                       * resynchronize, carry on until the end of the sequence
                       * and let the later error handling code catch it.
                       */
                      $mUcs4  = ($in & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;
                  } elseif (0xFC == (0xFE & $in)) {
                      // First octet of 6 octet sequence, see the comment for
                      // the 5 octet sequence.
                      $mUcs4  = ($in & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;
                  } else {
                      // Neither US-ASCII nor a legal first octet of a
                      // multi-octet sequence.
                      return false;
                  }
              } else {
                  // When mState is non-zero we expect a continuation octet.
                  if (0x80 != (0xC0 & $in)) {
                      // Incomplete multi-octet sequence.
                      return false;
                  }

                  // Legal continuation: merge its 6 payload bits.
                  $shift  = ($mState - 1) * 6;
                  $mUcs4 |= ($in & 0x0000003F) << $shift;

                  if (0 == --$mState) {
                      // End of the multi-octet sequence: mUcs4 now holds the
                      // final code point. Check for illegal sequences.
                      if (
                          // From Unicode 3.1, non-shortest form is illegal
                          ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                          (4 < $mBytes) ||
                          // From Unicode 3.2, surrogate characters are illegal
                          (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                          // Codepoints outside the Unicode range are illegal
                          ($mUcs4 > 0x10FFFF)
                      ) {
                          return false;
                      }

                      // The BOM is dropped from the output.
                      if (0xFEFF != $mUcs4) {
                          $out[] = $mUcs4;
                      }

                      // Reset the decoder state for the next character.
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                  }
              }
          }

          return $out;
      }

      /**
       * Plugin details: appends this plugin's name, version, author, homepage
       * and description to the passed list.
       *
       * @param array &$versions list of plugin version records
       *
       * @return boolean always true
       */
      public function onPluginVersion(&$versions)
      {
          $url = 'http://status.net/wiki/Plugin:DirectionDetector';

          $versions[] = array(
              'name' => 'Direction detector',
              'version' => DIRECTIONDETECTORPLUGIN_VERSION,
              'author' => 'Behrooz Shabani',
              'homepage' => $url,
              // TRANS: Plugin description.
              'rawdescription' => _m('Shows notices with right-to-left content in correct direction.')
          );

          return true;
      }
  }
  250. /*
  251. // Example:
  252. var_dump(DirectionDetectorPlugin::isRTL('RT @everplays ♺: دادگاه به دليل عدم حضور وکلای متهمان بنا بر اصل ١٣٥ قانون اساسی غير قانونی است')); // true
  253. */