tags.go 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. //
  2. // Copyright (C) 2017-2021 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "bufio"
  20. "sort"
  21. "strings"
  22. "unicode"
  23. "golang.org/x/text/transform"
  24. "golang.org/x/text/unicode/norm"
  25. )
  26. // https://stackoverflow.com/a/39425959
  27. // http://cldr-build.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3Aemoji%3A%5D&g=emoji
  28. func isEmojiRune(ru rune) bool {
  29. return ru == '@' || ru == '§' || ru == '$' || ru == '†' ||
  30. ('\u20d0' <= ru && ru <= '\u20ff') || // Combining Diacritical Marks for Symbols
  31. ('\u2328' == ru) || // keyboard
  32. ('\u238c' <= ru && ru <= '\u2454') || // Misc items
  33. ('\u2600' <= ru && ru <= '\u26FF') || // Misc symbols
  34. ('\u2700' <= ru && ru <= '\u27BF') || // Dingbats
  35. ('\u2b50' == ru) || // star
  36. ('\uFE00' <= ru && ru <= '\uFE0F') || // Variation Selectors
  37. ('\U0001f018' <= ru && ru <= '\U0001f270') || // Various asian characters
  38. ('\U0001F1E6' <= ru && ru <= '\U0001F1FF') || // Regional country flags
  39. ('\U0001F300' <= ru && ru <= '\U0001F5FF') || // Misc Symbols and Pictographs
  40. ('\U0001F600' <= ru && ru <= '\U0001F64F') || // Emoticons
  41. ('\U0001F680' <= ru && ru <= '\U0001F6FF') || // Transport and Map
  42. ('\U0001F900' <= ru && ru <= '\U0001F9FF') // Supplemental Symbols and Pictographs
  43. }
// tpf is the tag prefix rune: isTag accepts it as the leading rune of a tag
// and does not include it in the returned tag name.
const tpf = '#'
  45. /*
  46. * nota bene https://www.unicode.org/reports/tr31/#D2
  47. *
  48. * [\p{L}\p{Nl}\p{Other_ID_Start}-\p{Pattern_Syntax}-\p{Pattern_White_Space}]
  49. */
  50. func isIdStart(c rune) bool {
  51. if isEmojiRune(c) {
  52. return true
  53. }
  54. if unicode.In(c, unicode.Pattern_Syntax, unicode.Pattern_White_Space) {
  55. return false
  56. }
  57. return unicode.In(c, unicode.L, unicode.Nl, unicode.Other_ID_Start)
  58. }
  59. /*
  60. * nota bene https://www.unicode.org/reports/tr31/#D2
  61. *
  62. * [\p{ID_Start}\p{Mn}\p{Mc}\p{Nd}\p{Pc}\p{Other_ID_Continue}-\p{Pattern_Syntax}-\p{Pattern_White_Space}]
  63. */
  64. func isContinue(c rune) bool {
  65. if unicode.In(c, unicode.Pattern_Syntax, unicode.Pattern_White_Space) {
  66. return false
  67. }
  68. return isIdStart(c) || unicode.In(c, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue)
  69. }
  70. /* nota bene https://www.unicode.org/reports/tr31/#D2 */
  71. func isTag(tag string) string {
  72. if "" == tag {
  73. return ""
  74. }
  75. started := false
  76. ret := ""
  77. for _, c := range tag {
  78. if !started {
  79. if tpf == c {
  80. started = true
  81. continue
  82. } else {
  83. if isEmojiRune(c) {
  84. started = true
  85. ret = ret + string(c)
  86. continue
  87. }
  88. }
  89. break
  90. }
  91. if ret == "" {
  92. if isIdStart(c) {
  93. ret = ret + string(c)
  94. } else {
  95. break
  96. }
  97. } else {
  98. if isContinue(c) {
  99. ret = ret + string(c)
  100. } else {
  101. break
  102. }
  103. }
  104. }
  105. return ret
  106. }
  107. func tagsFromString(str string) []string {
  108. scanner := bufio.NewScanner(strings.NewReader(str))
  109. scanner.Split(bufio.ScanWords)
  110. ret := make([]string, 0, 10)
  111. tmp := make(map[string]struct{}, 10)
  112. tmp[""] = struct{}{}
  113. for scanner.Scan() {
  114. tag := isTag(scanner.Text())
  115. if _, ok := tmp[tag]; ok {
  116. continue
  117. }
  118. ret = append(ret, tag)
  119. tmp[tag] = struct{}{}
  120. }
  121. return ret
  122. }
  123. // https://stackoverflow.com/a/26722698
  124. func fold(str string) string {
  125. tr := transform.Chain(norm.NFD, transform.RemoveFunc(func(r rune) bool {
  126. return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
  127. }), norm.NFC)
  128. // todo: chain lowercase + trim
  129. if result, _, err := transform.String(tr, str); err != nil {
  130. panic(err)
  131. } else {
  132. return strings.TrimSpace(strings.ToLower(result))
  133. }
  134. }
  135. func tagsVisitor(tags ...string) func(func(string)) {
  136. return func(callback func(string)) {
  137. for _, tag := range tags {
  138. callback(tag)
  139. }
  140. }
  141. }
  142. func tagsNormalise(ds string, ex string, tavi func(func(string)), knovi func(func(string))) (description string, extended string, tags []string) {
  143. knodi := make(map[string]string, 1000)
  144. knovi(func(tag string) { knodi[fold(tag)] = tag })
  145. tags = make([]string, 0, 20)
  146. // 1. iterate text tags
  147. tadi := make(map[string]string, 20)
  148. tadi[""] = ""
  149. add := func(tag string) string {
  150. k := fold(tag)
  151. if _, ok := tadi[k]; ok {
  152. return ""
  153. }
  154. if v, ok := knodi[k]; ok {
  155. tag = v
  156. }
  157. tadi[k] = tag
  158. tags = append(tags, tag) // updating the reference correctly?
  159. return tag
  160. }
  161. for _, tag := range append(tagsFromString(ds), tagsFromString(ex)...) {
  162. add(tag)
  163. }
  164. // 2. visit all previous tags and add missing ones to tadi and extended
  165. tavi(func(tag string) {
  166. if t := add(tag); t == "" {
  167. return
  168. }
  169. ex += " #" + tag // todo: skip superfluous # before emojis
  170. })
  171. description = strings.TrimSpace(ds)
  172. extended = strings.TrimSpace(ex)
  173. sort.Strings(tags)
  174. return
  175. }