MelderReadText.cpp 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. /* melder_readtext.cpp
  2. *
  3. * Copyright (C) 2008,2010-2012,2014-2017 Paul Boersma
  4. *
  5. * This code is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This code is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. * See the GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this work. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. #include "melder.h"
  19. #include "../kar/UnicodeData.h"
  20. char32 MelderReadText_getChar (MelderReadText me) {
  21. if (my string32) {
  22. if (* my readPointer32 == U'\0') return U'\0';
  23. return * my readPointer32 ++;
  24. } else {
  25. if (* my readPointer8 == '\0') return U'\0';
  26. if (my input8Encoding == kMelder_textInputEncoding::UTF8) {
  27. char32 kar1 = (char32) (char8) * my readPointer8 ++;
  28. if (kar1 <= 0x00'007F) {
  29. return kar1;
  30. } else if (kar1 <= 0x00'00DF) {
  31. char32 kar2 = (char32) (char8) * my readPointer8 ++;
  32. return ((kar1 & 0x00'001F) << 6) | (kar2 & 0x00'003F);
  33. } else if (kar1 <= 0x00'00EF) {
  34. char32 kar2 = (char32) (char8) * my readPointer8 ++;
  35. char32 kar3 = (char32) (char8) * my readPointer8 ++;
  36. return ((kar1 & 0x00'000F) << 12) | ((kar2 & 0x00'003F) << 6) | (kar3 & 0x00'003F);
  37. } else if (kar1 <= 0x00'00F4) {
  38. char32 kar2 = (char32) (char8) * my readPointer8 ++;
  39. char32 kar3 = (char32) (char8) * my readPointer8 ++;
  40. char32 kar4 = (char32) (char8) * my readPointer8 ++;
  41. return ((kar1 & 0x00'0007) << 18) | ((kar2 & 0x00'003F) << 12) | ((kar3 & 0x00'003F) << 6) | (kar4 & 0x00'003F);
  42. } else {
  43. return UNICODE_REPLACEMENT_CHARACTER;
  44. }
  45. } else if (my input8Encoding == kMelder_textInputEncoding::MACROMAN) {
  46. return Melder_decodeMacRoman [(char8) * my readPointer8 ++];
  47. } else if (my input8Encoding == kMelder_textInputEncoding::WINDOWS_LATIN1) {
  48. return Melder_decodeWindowsLatin1 [(char8) * my readPointer8 ++];
  49. } else {
  50. /* Unknown encoding. */
  51. return (char32) (char8) * my readPointer8 ++;
  52. }
  53. }
  54. }
  55. mutablestring32 MelderReadText_readLine (MelderReadText me) {
  56. if (my string32) {
  57. Melder_assert (my readPointer32);
  58. Melder_assert (! my readPointer8);
  59. if (*my readPointer32 == U'\0') // tried to read past end of file
  60. return nullptr;
  61. char32 *result = my readPointer32;
  62. char32 *newline = str32chr (result, U'\n');
  63. if (newline) {
  64. *newline = U'\0';
  65. my readPointer32 = newline + 1;
  66. } else {
  67. my readPointer32 += str32len (result);
  68. }
  69. return result;
  70. } else {
  71. Melder_assert (my string8);
  72. Melder_assert (! my readPointer32);
  73. Melder_assert (my readPointer8);
  74. if (*my readPointer8 == '\0') // tried to read past end of file
  75. return nullptr;
  76. char *result8 = my readPointer8;
  77. char *newline = strchr (result8, '\n');
  78. if (newline) {
  79. *newline = '\0';
  80. my readPointer8 = newline + 1;
  81. } else {
  82. my readPointer8 += strlen (result8);
  83. }
  84. static char32 *text32 = nullptr;
  85. static int64 size = 0;
  86. int64 sizeNeeded = (int64) strlen (result8) + 1;
  87. if (sizeNeeded > size) {
  88. Melder_free (text32);
  89. text32 = Melder_malloc_f (char32, sizeNeeded + 100);
  90. size = sizeNeeded + 100;
  91. }
  92. Melder_8to32_inplace (result8, text32, my input8Encoding);
  93. return text32;
  94. }
  95. }
  96. int64 MelderReadText_getNumberOfLines (MelderReadText me) {
  97. int64 n = 0;
  98. if (my string32) {
  99. char32 *p = & my string32 [0];
  100. for (; *p != U'\0'; p ++) {
  101. if (*p == U'\n')
  102. n ++;
  103. }
  104. if (p - & my string32 [0] > 1 && p [-1] != U'\n')
  105. n ++;
  106. } else {
  107. char *p = & my string8 [0];
  108. for (; *p != '\0'; p ++) {
  109. if (*p == '\n')
  110. n ++;
  111. }
  112. if (p - & my string8 [0] > 1 && p [-1] != '\n')
  113. n ++;
  114. }
  115. return n;
  116. }
  117. conststring32 MelderReadText_getLineNumber (MelderReadText me) {
  118. int64 result = 1;
  119. if (my string32) {
  120. char32 *p = & my string32 [0];
  121. while (my readPointer32 - p > 0) {
  122. if (*p == U'\0' || *p == U'\n')
  123. result ++;
  124. p ++;
  125. }
  126. } else {
  127. char *p = & my string8 [0];
  128. while (my readPointer8 - p > 0) {
  129. if (*p == '\0' || *p == '\n')
  130. result ++;
  131. p ++;
  132. }
  133. return Melder_integer (result);
  134. }
  135. return Melder_integer (result);
  136. }
  137. autoMelderReadText MelderReadText_createFromFile (MelderFile file) {
  138. autoMelderReadText me = std::make_unique <structMelderReadText> ();
  139. my string32 = MelderFile_readText (file, & my string8);
  140. if (my string32) {
  141. my readPointer32 = & my string32 [0];
  142. } else {
  143. Melder_assert (my string8);
  144. my readPointer8 = & my string8 [0];
  145. my input8Encoding = Melder_getInputEncoding ();
  146. if (my input8Encoding == kMelder_textInputEncoding::UTF8 ||
  147. my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1 ||
  148. my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1 ||
  149. my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN)
  150. {
  151. if (Melder_str8IsValidUtf8 (my string8.get())) {
  152. my input8Encoding = kMelder_textInputEncoding::UTF8;
  153. } else if (my input8Encoding == kMelder_textInputEncoding::UTF8) {
  154. Melder_throw (U"Text is not valid UTF-8; please try a different text input encoding.");
  155. } else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1) {
  156. my input8Encoding = kMelder_textInputEncoding::ISO_LATIN1;
  157. } else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1) {
  158. my input8Encoding = kMelder_textInputEncoding::WINDOWS_LATIN1;
  159. } else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN) {
  160. my input8Encoding = kMelder_textInputEncoding::MACROMAN;
  161. }
  162. }
  163. }
  164. return me;
  165. }
  166. /* End of file melder_readtext.cpp */