translate.js 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744
  1. // TODO translate svg files
  2. // TODO refactor src/pages/*.html to use <lang.de> instead of <de> etc. -> make tags unambiguous
  3. // e.g. <tr> can be turkish or table row
  4. // TODO avoid double inserts
  5. // TODO verify: all text-fragments are inserted
  6. // TODO auto-detect sourceLangList from files in src/pages/*.html
  7. // TODO post-process old and manual translations
  8. // -> add/update rev="en#xxxxxxxx" (revision ID) to keep translations in sync
  9. // use different encoding than base64? base32 or base16 (hex) -> better for filenames
  10. // -> easier to build a content-addressable store to cache old versions
  11. // con: text fragments are small -> use one large database file, e.g. jsonlines format
  12. // collision safety? git uses short IDs of only 7 chars in base16
  13. // 12 safe symbols: []^'*-/_{}<>
  // Replacement ids are written with symbols instead of decimal digits.
  // The position of a symbol in codeNumKey is the digit it stands for.
  const codeNumKey = "^'*-/_{}<>"; // 10 digits
  // The same symbols, escaped for use inside a regex character class [...].
  // Both ^ and - must be escaped: an unescaped "*-/" is parsed as the range
  // U+002A..U+002F, which would also match the characters + , and .
  const codeNumRegexCharClass = "\\^'*\\-/_{}<>";
  // Characters that are stripped from imported translations.
  const removeRegexCharClass = [
    '\u200B', // ZERO WIDTH SPACE from google https://stackoverflow.com/questions/36744793
  ].join('');
  // Characters that are tolerated inside a code when matching.
  const artifactsRegexCharClass = removeRegexCharClass + [
    ' ', // space
  ].join('');
  // Character class for matching codes: code symbols plus tolerated artifacts.
  const codeNumRegexCharClassImport = codeNumRegexCharClass + artifactsRegexCharClass;

  // digit -> symbol
  const encodeNumTable = Object.fromEntries(codeNumKey.split('').map((c, i) => [i, c]));

  /**
   * Encode a number as a string of code symbols, one symbol per decimal digit.
   * @param {number|string} num - value whose decimal digits are encoded
   * @returns {string} the symbol string, for example 123 -> "'*-"
   */
  function encodeNum(num) {
    return num.toString().split('').map(i => encodeNumTable[i]).join('');
  }

  // symbol -> digit
  const decodeNumTable = Object.fromEntries(codeNumKey.split('').map((c, i) => [c, i]));

  /**
   * Decode a string of code symbols back to a number.
   * Whitespace is removed first. Characters that are not code symbols
   * contribute no digit, because join() renders undefined as an empty string.
   * @param {string} str - symbol string, for example "'*-"
   * @returns {number} the decoded integer, or NaN when no symbol was found
   */
  function decodeNum(str) {
    const digits = str.replace(/\s+/g, '').split('').map(c => decodeNumTable[c]).join('');
    return parseInt(digits, 10);
  }
  // Debug switches and limits.
  // dryRunExport: when truthy, exportLang() skips the "output file exists" check
  // and returns before the html file is generated.
  const dryRunExport = 0;
  // dryRunImport: when truthy, importLang() prints old and new nodes
  // and does not write any file.
  const dryRunImport = 0;
  // showDebug: when truthy, verbose debug output is printed.
  const showDebug = 1;
  // Maximum size of one text group in exportLang().
  const charLimit = 5000; // limit of google, deepl
  //const charLimit = 1000; // good page size for manual translations or debugging
  // Both values are set in main().
  let useXml = false;
  let translatorName = 'google';
  /**
   * Command line entry point.
   *
   * Reads <sourceLang> <targetLang> [translationFile] from process.argv and
   * dispatches to importLang (translation file given and existing),
   * exportLang (only the two languages given) or showHelp (anything else).
   * Also sets the module-level translatorName and useXml.
   */
  function main() {
    // drop the node binary, so that args[0] is this script
    const args = process.argv.slice(1);

    const langAliases = {
      zh: 'zh-CN', // simplified chinese
    };
    const resolveLang = (code) => (
      (code && code in langAliases) ? langAliases[code] : code
    );

    const sourceLang = resolveLang(args[1]);
    const targetLang = resolveLang(args[2]);
    const inputFile = args[3];

    //const translatorName = 'google';
    translatorName = translatorLangs.deepl.includes(targetLang) ? 'deepl' : 'google';
    // DEBUG
    translatorName = 'google';

    // xml is broken in all translators
    // -> encode to "symbols in square braces"
    // which are preserved by all translators
    const xmlTranslators = [
      //'deepl', // not really. some xml is preserved, some xml is translated -> not usable
    ];
    useXml = xmlTranslators.includes(translatorName);

    // both languages are required for import and for export
    if (!sourceLang || !targetLang) {
      showHelp();
      return;
    }
    if (inputFile && fs.existsSync(inputFile)) {
      importLang(sourceLang, targetLang, inputFile);
      return;
    }
    exportLang(sourceLang, targetLang);
  }
  /**
   * Print the usage text and a sample workflow to stdout.
   */
  function showHelp() {
    const scriptFile = __filename.split('/').pop();
    const cmd = `node scripts/${scriptFile}`;
    const lines = [
      'usage:',
      `${cmd} <sourceLang> <targetLang>`,
      `${cmd} <sourceLang> <targetLang> <translationFile>`,
      '',
      'sample:',
      `${cmd} de en # from source files, generate translate-de2en.html`,
      `# manually create translate-de2en.txt`,
      `${cmd} de en translate-de2en.txt # add <en auto t="${nowDate}">...</en> tags to source files`,
      `# manually fix the translations, and replace <en auto t="${nowDate}"> with <en>`,
      //`${cmd} translate-de2en.txt en`
    ];
    // every line, including the last one, is terminated by a newline
    console.log(lines.join('\n') + '\n');
  }
  // Target languages per translator. main() checks the deepl list
  // to choose between 'deepl' and 'google'.
  const translatorLangs = {
    deepl: [
      // 2021-05-25
      'bg', 'zh', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'it', 'ja',
      'lv', 'lt', 'pl', 'pt', 'pt-PT', 'pt-BR', 'ro', 'ru', 'sk', 'sl', 'es', 'sv'
    ],
  };
  // Number of characters of each text group shown as link text in the export html
  // (half from the start of the group, half from the end).
  const previewTextLength = 500;
  // Dependencies and module-level setup.
  const fs = require('fs');
  const appRoot = require('app-root-path').path;
  const path = require('path');
  const glob = require('fast-glob');
  const { parse } = require('node-html-parser'); // patched version github:taoqf/node-fast-html-parser#60ea8fee51f07fbc712b5642a0496f12748eb90f
  const htmlEntities = require('he');
  // the eleventy config module exports a function; call it to get the config object
  const elevConf = require(appRoot + '/config/eleventy.config.js')();
  // side effect: the working directory becomes appRoot
  process.chdir(appRoot);
  // path of this script relative to appRoot, printed in the "next steps" text of exportLang()
  const scriptPath = path.relative(appRoot, process.argv[1]);
  const inputDir = elevConf.dir.input;
  // glob of the html files that are read by exportLang() and modified by importLang()
  const infilesGlob = inputDir + '/pages/*.html';
  const sourceLangList = ['de', 'en']; // TODO get from 11ty metadata
  // https://github.com/iansan5653/unraw/issues/29
  // deepl.com:
  // / -> \/
  // \ -> \\
  /**
   * Prefix every forward slash and every backslash with a backslash.
   * @param {string} str - text to encode
   * @returns {string} the encoded text
   */
  function deeplBackslashEncode(str) {
    return str.replace(/[\/\\]/g, (ch) => `\\${ch}`);
  }
  /**
   * Format a date in local time as a sortable timestamp that is safe in file names.
   *
   * The string is built from the date fields directly, not from
   * toLocaleString('lt'), so the result does not depend on the locale data
   * that is available in the running Node.js build.
   *
   * @param {Date|null} [date=null] - date to format, defaults to the current time
   * @returns {string} sample result: '2021-03-21.21-05-36'
   */
  function dateTime(date = null) {
    const d = date || new Date();
    const pad = (num, len = 2) => String(num).padStart(len, '0');
    const day = [pad(d.getFullYear(), 4), pad(d.getMonth() + 1), pad(d.getDate())].join('-');
    const time = [pad(d.getHours()), pad(d.getMinutes()), pad(d.getSeconds())].join('-');
    return `${day}.${time}`;
  }
  // timestamp of this run, used in backup paths and in the help text
  const nowDate = dateTime();
  132. const crypto = require("crypto");
  /**
   * SHA-1 digest of a string, encoded as base64.
   * @param {string} str - input text
   * @returns {string} base64 string of the SHA-1 hash
   */
  function sha1sum(str) {
    const hash = crypto.createHash("sha1");
    hash.update(str);
    return hash.digest("base64");
  }
  // google can translate -- to -
  // so we use "safe" ids without repetition
  /**
   * Find the next id after lastId whose decimal string has no two equal
   * adjacent characters.
   * @param {number} lastId - the previously used id
   * @returns {number} the smallest id greater than lastId without repetition
   */
  function getNextSafeId(lastId) {
    // matches any character that is directly followed by itself
    const repeatedChar = /(.)\1/;
    let candidate = lastId + 1;
    while (repeatedChar.test(String(candidate))) {
      //if (showDebug) console.log(`skip unsafe id ${candidate}`);
      candidate++;
    }
    return candidate;
  }
  153. /////////////////////// export ////////////////////////////
  /**
   * Export step: collect the source-language nodes of all input files and
   * write translate-<sourceLang>2<targetLang>.html with links to the translator.
   *
   * For every node matching sourceLang inside a "langs" parent, the markup is
   * wrapped in <html f p n> (file, parent and node index). All html tags,
   * entities and surrounding whitespace are then replaced by codes of the form
   * "\n[symbols]\n" (see encodeNum), and the indents of the remaining text
   * lines are stored per code. The encoded parts are grouped into chunks below
   * charLimit. The replacement table is embedded in the html file as a comment,
   * where importLang() reads it back.
   *
   * Exits the process with code 1 when sourceLang is unknown or when the
   * output file already exists.
   *
   * @param {string} [sourceLang='de'] - language code of the existing text
   * @param {string} [targetLang='en'] - language code to translate to
   */
  function exportLang(sourceLang = 'de', targetLang = 'en') {
    // TODO more dynamic ...
    // check src/pages/*.html if sourceLang is found
    if (!sourceLangList.includes(sourceLang)) {
      console.log(`error: sourceLang ${sourceLang} not found. must be one of: ${sourceLangList.join(', ')}`);
      process.exit(1);
    }
    const htmlFile = `translate-${sourceLang}2${targetLang}.html`;
    if (!dryRunExport && fs.existsSync(htmlFile)) {
      console.log(`error: output file exists: ${htmlFile}`);
      console.log(`\nsolutions:`);
      console.log(`mv ${htmlFile} ${htmlFile}.${nowDate}.bak`);
      console.log(`rm ${htmlFile}`);
      process.exit(1);
    }
    console.log(`glob: ${infilesGlob}`);
    const textParts = [];
    const replacementData = {
      replacementList: {}, // sparse array due to "safe" ids, see getNextSafeId
      lastId: -1,
    };

    // store the matched markup under a new safe id, return the code that replaces it
    function getReplace(match) {
      const replacementId = getNextSafeId(replacementData.lastId);
      replacementData.lastId = replacementId;
      const code = encodeNum(replacementId);
      replacementData.replacementList[replacementId] = {
        value: match,
        code,
        indentList: [],
      };
      return `\n[${code}]\n`;
    }

    // loop-invariant regular expressions
    // rewrites the lang attribute of a node that is not wrapped
    const langAttrRegex = new RegExp(`^<([^>\\s]+)\\s+[^>]*lang="${sourceLang}"[^>]*>`, 's');
    // everything that must not be sent to the translator as plain text
    const encodeRegex = new RegExp(
      [
        `\\s*`, // space before
        `(?:`,
        `\\[[${codeNumRegexCharClassImport}]+\\]`, // "symbols in square braces"
        `|`,
        `\\n{2,}`, // extra newlines: needed for transliterated translations
        `|`,
        `<.+?>`, // html tags
        `|`,
        `&[^ ]+;`, // html entities
        `)`,
        `\\s*` // space after
      ].join(''),
      'sg'
    );
    // use lookahead (?=...) to include delimiter as prefix
    const blockSplitRegex = new RegExp(`(?=\\n\\[[${codeNumRegexCharClass}]+\\]\\n)`);
    const blockRegex = new RegExp(`(\\n\\[([${codeNumRegexCharClass}]+)\\]\\n)(.*)$`, 's');

    // loop input files
    glob.sync(infilesGlob)
      //.slice(0, 2) // debug: process less input files
      .forEach((file, fileIdx) => {
        console.log(`input: ${file}`);
        const inputHtml = fs.readFileSync(file, 'utf8');
        const root = parse(inputHtml);
        const parentNodes = root.querySelectorAll('langs, .langs');
        // loop parentNodes -> get textParts
        for (const [pi, p] of parentNodes.entries()) {
          // loop nodes
          p.querySelectorAll(`${sourceLang}, *[lang="${sourceLang}"]`).forEach((n, ni) => {
            // nodes without a lang attribute are language tags, and get a new wrapper tag
            const wrap = (n.hasAttribute('lang') == false);
            const nodeStart = n._source.start; // only in patched version of html parser
            const lineStart = inputHtml.lastIndexOf('\n', n._source.start) + 1;
            const indent = inputHtml.slice(lineStart, nodeStart).match(/^\s*/)[0];
            //const tagName = targetLang;
            const tagName = `lang.${targetLang}`;
            //const tagAttrs = `generator="${translatorName}" t="${nowDate}"`;
            // base of translation = sourceText
            const base = `${sourceLang}#${sha1sum(n.innerHTML).slice(0, 8)}`;
            const extraAttrs = `rev="${base}"`; // add revision ID
            // TODO properly parse + replace attributes if wrap == false
            const sBase = indent + (wrap
              ? `<${tagName} ${extraAttrs}>${n.innerHTML}</${tagName}>`
              : n.outerHTML.replace(langAttrRegex, `<$1 lang="${targetLang}" ${extraAttrs}>`)
            );
            if (showDebug) console.dir({ indent, wrap, tagName, extraAttrs });
            const sXml = `<html f="${fileIdx}" p="${pi}" n="${ni}">\n${sBase}\n</html>`;
            if (showDebug) console.log(`textPart before replace:\n${sXml}`);
            // encode html
            // replace with "symbols in square braces"
            // consume all whitespace around the original value
            let textPart = sXml.replace(encodeRegex, match => getReplace(match));
            // encode indents between replacements
            textPart = (
              textPart
                .split(blockSplitRegex)
                .map(str => {
                  let [_, replacement, idxStr, rest] = str.match(blockRegex);
                  const replaceId = decodeNum(idxStr);
                  replacementData.replacementList[replaceId].indentList = [];
                  // remove indents
                  rest = rest.split('\n').map(line => {
                    const [_, indent, lineRest] = line.match(/^(\s*)(.*)/);
                    replacementData.replacementList[replaceId].indentList.push(indent);
                    return lineRest;
                  }).join('\n');
                  return replacement + rest;
                })
            ).join('');
            if (showDebug) console.log(`textPart after replace:\n${textPart}`);
            textParts.push(textPart);
          })
        } // done loop parentNodes
        //if (fileIdx > 1) process.exit(0); // DEBUG
      }); // done loop input files

    if (1 && showDebug) {
      for (const id of Object.keys(replacementData.replacementList)) { // sparse array
        console.log(`[${encodeNum(id)}] = id ${id} = ${replacementData.replacementList[id].value}`)
      }
    }
    if (dryRunExport) return;

    // generate links
    // reserve some room per group, so a group stays below charLimit
    const groupOverhead = 3*(`\n\n<meta attrrrrrrrr="vallll"/>\n\n`.length);
    const textGroups = (
      textParts.reduce((acc, val) => {
        const current = acc[acc.length - 1];
        const nextLen = current.length + val.length + groupOverhead;
        // start a new group only when the current group has content.
        // otherwise a text part that is larger than charLimit
        // would leave an empty group (and an empty link) behind
        if (nextLen >= charLimit && current.length > 0) {
          acc.push('');
        }
        acc[acc.length - 1] += val + '\n\n';
        return acc;
      }, [''])
      // no text parts -> no links
      .filter(group => group.length > 0)
      // TODO group sibling replacements into one code.
      // the previous implementation was removed: it matched numeric codes
      // like [123] and used a replacementList array, both no longer exist
    );
    if (showDebug) {
      console.log(textGroups.map((s, i) => `textGroup ${i}:\n${s}\n`).join('\n'));
    }
    const translateUrl = t => (
      translatorName == 'google' ? `https://translate.google.com/?sl=${sourceLang}&tl=${targetLang}&text=${encodeURIComponent(t)}&op=translate` :
      translatorName == 'deepl' ? `https://www.deepl.com/translator#${sourceLang}/${targetLang}/${encodeURIComponent(deeplBackslashEncode(t))}` :
      '#invalid-translatorName'
    );
    // link text = preview of the start and the end of the group
    const translateLinks = textGroups.map(t => (
      `<li><a target="_blank" href="${translateUrl(t)}">${htmlEntities.encode(t.slice(0, previewTextLength/2))} ... ${htmlEntities.encode(t.slice(-previewTextLength/2))}</a></li>`
    ));
    const htmlSrc = (
      '<style>' +
      'a:visited { color: green; }' +
      'a { text-decoration: none; }' +
      'a:hover { text-decoration: underline; }' +
      'li { margin-bottom: 1em; }' +
      '</style>' +
      '<ol>\n\n' + translateLinks.join('\n\n') + '</ol>\n' +
      // embed replacements in html comment
      '<!-- replacementData = ' +
      JSON.stringify(replacementData, null, 2) +
      ' = replacementData -->'
    );
    fs.writeFileSync(htmlFile, htmlSrc, 'utf8');
    const htmlFileUrl = encodeURI('file://' + path.resolve(htmlFile));
    console.log(`output: ${htmlFile}`);
    console.log(`
next steps:
1. open in your browser:
${htmlFileUrl}
2. click the first link
3. fix the translation on the translator website,
so the translator can learn to translate better
4. scroll down, on the bottom right, click on: copy translation
5. paste the translation to your text editor
remove the footers:
Translated with www.DeepL.com/Translator (free version)
6. repeat for all links (append translations to text file)
7. save the text file, for example as translate-${sourceLang}2${targetLang}.txt
8. run this script again with the text file, for example:
node ${scriptPath} ${sourceLang} ${targetLang} translate-${sourceLang}2${targetLang}.txt
9. add the new language code to src/_data/metadata.js -> metadata.languages
10. restart the dev server (to reload metadata.js. hot reload is not working here)
11. commit the new translation:
git add src/pages/ src/_data/metadata.js
git commit -m 'alchi-book: add ${targetLang} translation'
12. commit the new build:
# stop the dev server
npm run build
git add build/
git commit -m 'alchi-book: update build'
note:
translators will change the order of words,
so in some cases, html markup tags like <b>....</b>
will be in a wrong position.
note:
the ${htmlFile} file is valid only for one iteration.
if you added nodes to the html files,
then you must generate a new ${htmlFile} file
`)
  }
  379. /////////////////////// import ////////////////////////////
  /**
   * Import step: decode a translated text file and insert the translated
   * nodes into the source html files.
   *
   * The replacement table is read from translate-<sourceLang>2<targetLang>.html
   * (written by exportLang). Every code "[symbols]" in the input is replaced by
   * its original markup, and the stored indents are restored. The decoded
   * <html f p n> parts are then inserted after the matching source nodes.
   * Changed files are moved to a backup directory before they are rewritten,
   * and a rollback.sh and a diff.sh script are written next to the backups.
   *
   * Exits the process with code 1 when the html file or its replacement data
   * is missing, when a code cannot be decoded or is used twice, or when the
   * <html> wrapper tags of the decoded text are missing or do not match.
   *
   * @param {string} sourceLang - language code of the existing text
   * @param {string} targetLang - language code of the translation
   * @param {string} inputFile - path of the text file with the pasted translations
   */
  function importLang(sourceLang, targetLang, inputFile) {
    let input = fs.readFileSync(inputFile, 'utf8');
    // remove unwanted characters
    input = input.replace(new RegExp(`[${removeRegexCharClass}]`, 'g'), '');

    // decode replacements
    const htmlFile = `translate-${sourceLang}2${targetLang}.html`;
    if (!fs.existsSync(htmlFile)) {
      console.log(`error: html file not found: ${htmlFile}`);
      console.log('this file is required to decode replacements');
      process.exit(1);
    }
    const htmlSrc = fs.readFileSync(htmlFile, 'utf8');
    const replacementDataMatch = htmlSrc.match(/<!-- replacementData = (.*) = replacementData -->/s);
    if (replacementDataMatch == null) {
      console.log(`parse error: replacementData not found in ${htmlFile}`);
      process.exit(1);
    }
    const replacementData = JSON.parse(replacementDataMatch[1]);
    const replacementList = replacementData.replacementList;
    console.log(`loaded ${Object.keys(replacementList).length} replacements from ${htmlFile}`);
    if (1 && showDebug) {
      for (const id of Object.keys(replacementList)) { // sparse array
        console.dir({ id, replacement: replacementList[id] })
        console.log(`[${encodeNum(id)}] = id ${id} = ${replacementList[id].value}`)
      }
    }

    // quickfix to restore newlines around replacements
    input = ('\n' + input + '\n');

    // decode replacements and indents
    // use lookahead (?=...) to include delimiter as prefix
    // \n? -> allow missing newlines around replacements -> parse errors! -> revoke
    const blockSplitRegex = new RegExp(`(?=\\n\\[[${codeNumRegexCharClassImport}]+\\]\\n)`); // add \n to match
    const blockRegex = new RegExp(`(\\n?\\[([${codeNumRegexCharClassImport}]+)\\]\\n?)(.*)$`, 's'); // optional \n
    let lastReplaceId = -1;
    input = (
      input
        .split(blockSplitRegex)
        .map((str, blockIdx) => {
          if (showDebug) console.dir({ blockStr: str });
          const m = str.match(blockRegex);
          if (!m) return str; // no replace
          let [_, _replacement, idxStr, rest] = m;
          const replaceId = decodeNum(idxStr);
          // test for steadiness = simple validation
          if (replaceId < lastReplaceId) {
            console.log(`error: replaceId is not steady. did the translator break our code?`)
            console.log(`lastReplaceId = ${lastReplaceId}`)
            console.log(`replaceId = ${replaceId}`)
          }
          if (showDebug) console.dir({ _replacement, idxStr, replaceId, rest });
          if (!replacementList[replaceId]) {
            console.log(`error: invalid replaceId ${replaceId}. did the translator break our code?`)
            console.dir({ _replacement, idxStr, replaceId, rest });
            console.log(`last valid replaceId was ${lastReplaceId}`)
            // TODO show context in sourceText?
            // we are looking for the original replacement-code
            const nextSafeId = getNextSafeId(lastReplaceId);
            const nextReplacement = replacementList[nextSafeId];
            console.log(`next safe replaceId would be ${nextSafeId}:`)
            console.dir(nextReplacement)
            // the next safe id can be missing in the list (for example after the last code).
            // the guess is only printed when there is a candidate
            if (nextReplacement) {
              console.log(`wild guess: the translator translated our code ${idxStr} to ${nextReplacement.code}`)
              console.dir({ originalCode: nextReplacement.code, translatedCode: idxStr });
            }
            console.log(`error: failed to decode replacements`);
            process.exit(1); // fatal error
            // its a bad solution to guess the right replaceId
            // cos in many cases, this will fail silently, and produce errors in the result
            // we can reduce the chance of "false positives"
            // by making the replacement ids more sparse, or random
          }
          if (replacementList[replaceId].consumed) {
            console.log(`error: duplicate replaceId ${replaceId}`);
            console.dir({ _replacement, idxStr, replaceId, rest });
            console.log(`replacement was already consumed in:`);
            console.dir(replacementList[replaceId].consumedBy);
            console.log(`error: failed to decode replacements`);
            process.exit(1); // fatal error
          }
          // restore indents
          // FIXME make this more robust against newline removing (transliterated translations)
          rest = rest.split('\n').map((lineRest, lineIdx) => {
            const indent = replacementList[replaceId].indentList[lineIdx] || '';
            return indent + lineRest;
          }).join('\n');
          // decode replacement
          const replacement = replacementList[replaceId].value;
          replacementList[replaceId].consumed = true; // prevent collisions
          replacementList[replaceId].consumedBy = { blockIdx, blockStr: str };
          lastReplaceId = replaceId;
          return replacement + rest;
        })
    ).join('');
    if (showDebug) {
      console.log('decoded html:');
      console.log(input);
      //return; // DEBUG
    }

    // validate html
    // match() returns null when nothing is found
    const numOpen = (input.match(/<html/g) || []).length;
    const numClose = (input.match(/<\/html>/g) || []).length;
    if (numOpen != numClose) {
      console.log(`ERROR: html tags mismatch in ${inputFile}:`);
      console.log(`open <html> tags: ${numOpen}`);
      console.log(`close </html> tags: ${numClose}`);
      // locate first mismatch
      // assume: html tags are not nested
      // the replace callback is called in order of position
      const tags = [];
      input.replace(/<html|<\/html>/g, (tag, idx) => {
        tags.push({ idx, isOpen: (tag == '<html') });
        return tag;
      });
      // position of a tag by index. out of range -> start or end of input
      const posAt = (tagIdx) => (
        tagIdx < 0 ? 0 :
        tagIdx >= tags.length ? input.length :
        tags[tagIdx].idx
      );
      let level = 0;
      for (let tagIdx = 0; tagIdx < tags.length; tagIdx++) {
        level += (tags[tagIdx].isOpen ? 1 : -1);
        if (level < 0 || 1 < level) {
          const idx = posAt(tagIdx);
          const last2Idx = posAt(tagIdx - 2);
          const lastIdx = posAt(tagIdx - 1);
          const nextIdx = posAt(tagIdx + 1);
          const next2Idx = posAt(tagIdx + 2);
          const m = '='.repeat(30);
          console.log(`html tags mismatch starts at position ${idx}. context:`);
          console.log(`${m} pos -2 = ${last2Idx} ${m}`);
          console.log(input.slice(last2Idx, lastIdx));
          console.log(`${m} pos -1 = ${lastIdx} ${m}`);
          console.log(input.slice(lastIdx, idx));
          console.log(`${m} pos 0 = ${idx} ${m}`);
          console.log(input.slice(idx, nextIdx));
          console.log(`${m} pos +1 = ${nextIdx} ${m}`);
          console.log(input.slice(nextIdx, next2Idx));
          console.log(`${m} pos +2 = ${next2Idx} ${m}`);
          break; // show only first
        }
      }
      console.log(`fatal error: html tags mismatch. translation was not imported`)
      process.exit(1);
    }
    if (numOpen == 0) {
      // without <html> wrapper tags there is nothing to import
      console.log(`fatal error: no <html> tags found in ${inputFile}. translation was not imported`)
      process.exit(1);
    }

    // parse
    const textParts = [];
    const inputRest = input.replace(
      /<html f="([0-9]+)" p="([0-9]+)" n="([0-9]+)">\n(.+?)\n<\/html>/sg,
      (_, file, part, node, str) => {
        textParts.push({
          file: parseInt(file, 10),
          part: parseInt(part, 10),
          node: parseInt(node, 10),
          str,
        });
        return ''; // remove from input
      }
    );
    // should be empty ...
    const inputRestTrimmed = inputRest.trim();
    if (inputRestTrimmed.length > 0) {
      console.log(`inputRest = ${JSON.stringify(inputRestTrimmed.replace(/\n+/sg, '\n'))}`);
    }

    const backupDir = `backup/translate/${nowDate}.${sourceLang}2${targetLang}`;
    const rollbackScriptPath = `${backupDir}/rollback.sh`;
    let rollbackScript = [
      '#!/usr/bin/env bash',
      '# rollback translation',
      'shopt -s expand_aliases',
      `alias mv='mv --verbose'`,
      '',
    ].join('\n');
    const diffScriptPath = `${backupDir}/diff.sh`;
    let diffScript = [
      '#!/usr/bin/env bash',
      '# diff translation',
      'shopt -s expand_aliases',
      `alias diff='diff --unified --color=auto'`,
      '',
    ].join('\n');

    console.log(`glob: ${infilesGlob}`);
    // we assume that source files did not change since we called exportLang()
    // TODO verify automatic, mismatch is fatal error
    let changedFiles = 0;
    glob.sync(infilesGlob)
      //.slice(0, 2) // debug: process less input files
      .forEach((file, fileIdx) => {
        const sourceBefore = fs.readFileSync(file, 'utf8');
        const root = parse(sourceBefore);
        const parentNodes = root.querySelectorAll('langs, .langs');
        let insertedNodes = 0;
        // loop parentNodes -> get textParts
        for (const [pi, p] of parentNodes.entries()) {
          p.querySelectorAll(`${sourceLang}, *[lang="${sourceLang}"]`).forEach((n, ni) => {
            // same file/parent/node indices as in exportLang
            const textNode = textParts.find(o => (
              o.file == fileIdx &&
              o.part == pi &&
              o.node == ni
            ));
            if (!textNode) {
              //console.log(`warning: no imported translation for ${fileIdx} ${pi} ${ni} ${n.toString().slice(0, 100)} ...`);
              return; // continue
            }
            n.insertAdjacentHTML('afterend', '\n' + textNode.str);
            insertedNodes++;
            if (dryRunImport) {
              const indent = textNode.str.match(/^\s*/)[0]; // quick n dirty ...
              console.log(`old node:`); console.dir({ ...textNode, str: '', file }); console.log(indent + n.toString())
              console.log(`new node:`); console.log(textNode.str)
            }
          })
        }
        if (!dryRunImport && insertedNodes > 0) {
          // move original to backup
          const backupFile = `${backupDir}/${file}`;
          fs.mkdirSync(path.dirname(backupFile), { recursive: true });
          fs.renameSync(file, backupFile);
          console.log(`output: ${backupFile}`);
          rollbackScript += `mv ${backupFile} ${file}\n`
          diffScript += `diff ${backupFile} ${file}\n`
          changedFiles++;
          //const outFile = file + '.add-' + targetLang + '.txt'; // use *.txt extension to avoid *.html glob match
          const outFile = file; // replace input file (after creating a backup copy)
          fs.writeFileSync(outFile, root.toString(), 'utf8');
          console.log(`output: ${outFile}`);
        }
      });
    if (!dryRunImport && changedFiles > 0) {
      rollbackScript += `mv ${backupDir} ${backupDir}.rolled-back\n`
      fs.writeFileSync(rollbackScriptPath, rollbackScript, 'utf8');
      console.log(`output: ${rollbackScriptPath}`);
      fs.writeFileSync(diffScriptPath, diffScript, 'utf8');
      console.log(`output: ${diffScriptPath}`);
    }
  }
  614. main();