12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744 |
- // TODO translate svg files
- // TODO refactor src/pages/*.html to use <lang.de> instead of <de> etc. -> make tags unambiguous
- // e.g. <tr> can be turkish or table row
- // TODO avoid double inserts
- // TODO verify: all text-fragments are inserted
- // TODO auto-detect sourceLangList from files in src/pages/*.html
- // TODO post-process old and manual translations
- // -> add/update rev="en#xxxxxxxx" (revision ID) to keep translations in sync
- // use different encoding than base64? base32 or base16 (hex) -> better for filenames
- // -> easier to build a content-addressable store to cache old versions
- // con: text fragments are small -> use one large database file, e.g. jsonlines format
- // collision safety? git uses short IDs of only 7 chars in base16
- // 12 safe symbols: []^'*-/_{}<>
// Replacement codes are decimal numbers written with symbols instead of digits:
// the symbol at index i of codeNumKey stands for the decimal digit i.
const codeNumKey = "^'*-/_{}<>"; // 10 digits
// The same symbols, escaped for use inside a regex character class [...]:
// - '^' is escaped so it is never read as negation
// - '-' is escaped so that '*-/' is not read as the range '*'..'/'
//   (that range would also match '+', ',' and '.', so text like "[...]" looked like a code)
const codeNumRegexCharClass = "\\^'*\\-/_{}<>";
// characters that are stripped from imported translations
const removeRegexCharClass = [
  '\u200B', // ZERO WIDTH SPACE from google https://stackoverflow.com/questions/36744793
].join('');
// characters that may show up inside a code after translation
const artifactsRegexCharClass = removeRegexCharClass + [
  ' ', // space
].join('');
// character class for codes in translated text: code symbols plus artifacts
const codeNumRegexCharClassImport = codeNumRegexCharClass + artifactsRegexCharClass;

// digit -> symbol
const encodeNumTable = Object.fromEntries(codeNumKey.split('').map((c, i) => [i, c]));

/**
 * Encode a non-negative integer as a string of code symbols.
 * @param {number|string} num - number (or decimal string) to encode
 * @returns {string} one symbol of codeNumKey per decimal digit
 */
function encodeNum(num) {
  return num.toString().split('').map(i => encodeNumTable[i]).join('');
}

// symbol -> digit
const decodeNumTable = Object.fromEntries(codeNumKey.split('').map((c, i) => [c, i]));

/**
 * Decode a string of code symbols back to a number.
 * Whitespace is removed first; characters that are not code symbols are ignored.
 * @param {string} str - encoded number
 * @returns {number} decoded integer, or NaN when str contains no code symbol
 */
function decodeNum(str) {
  const digits = str.replace(/\s+/g, '').split('').map(c => decodeNumTable[c]).join('');
  return parseInt(digits, 10);
}
// run-time switches (0 = off, 1 = on)
const dryRunExport = 0; // when set, exportLang() stops before grouping and writing the html file
const dryRunImport = 0; // when set, importLang() only prints old/new nodes and writes no files
const showDebug = 1; // print verbose debug output

// maximum size of one text group passed to the translator
const charLimit = 5000; // limit of google, deepl
//const charLimit = 1000; // good page size for manual translations or debugging

// both are reassigned in main()
let useXml = false;
let translatorName = 'google';
/**
 * Command-line entry point.
 * Reads <sourceLang> <targetLang> [translationFile] from process.argv,
 * selects the translator and then dispatches to importLang(), exportLang() or showHelp().
 */
function main() {
  // drop the node binary: the first remaining entry is the script path,
  // followed by the user arguments
  const [, rawSourceLang, rawTargetLang, inputFile] = process.argv.slice(1);

  // short language codes that must be expanded
  const langMap = {
    zh: 'zh-CN', // simplified chinese
  };
  const getLang = (str) => ((str && str in langMap) ? langMap[str] : str);

  const sourceLang = getLang(rawSourceLang);
  const targetLang = getLang(rawTargetLang);

  //const translatorName = 'google';
  translatorName = (
    translatorLangs.deepl.includes(targetLang) ? 'deepl' :
    'google'
  );
  // DEBUG
  translatorName = 'google';

  // xml is broken in all translators
  // -> encode to "symbols in square braces"
  // which are preserved by all translators
  const xmlTranslators = [
    //'deepl', // not really. some xml is preserved, some xml is translated -> not usable
  ];
  useXml = xmlTranslators.includes(translatorName);

  // both languages are required for any action
  if (!sourceLang || !targetLang) {
    showHelp();
    return;
  }
  // an existing translation file selects the import step
  if (inputFile && fs.existsSync(inputFile)) {
    importLang(sourceLang, targetLang, inputFile);
    return;
  }
  exportLang(sourceLang, targetLang);
}
/**
 * Print usage instructions and a sample workflow to stdout.
 */
function showHelp() {
  // path.basename handles the path separator of the current platform,
  // splitting on '/' would return the whole path on windows
  const scriptName = 'scripts/' + path.basename(__filename);
  console.log(
    'usage:\n' +
    `node ${scriptName} <sourceLang> <targetLang>\n` +
    `node ${scriptName} <sourceLang> <targetLang> <translationFile>\n` +
    '\n' +
    'sample:\n' +
    `node ${scriptName} de en # from source files, generate translate-de2en.html\n` +
    `# manually create translate-de2en.txt\n` +
    `node ${scriptName} de en translate-de2en.txt # add <en auto t="${nowDate}">...</en> tags to source files\n` +
    `# manually fix the translations, and replace <en auto t="${nowDate}"> with <en>\n`
  );
}
// target languages per translator; main() checks the deepl list
const translatorLangs = {
  // 2021-05-25
  deepl: [
    'bg', 'zh', 'cs', 'da', 'nl',
    'et', 'fi', 'fr', 'de', 'el',
    'hu', 'it', 'ja', 'lv', 'lt',
    'pl', 'pt', 'pt-PT', 'pt-BR', 'ro',
    'ru', 'sk', 'sl', 'es', 'sv',
  ],
};
// total number of characters shown per link in the generated html (half from the start, half from the end)
const previewTextLength = 500;
- const fs = require('fs');
- const appRoot = require('app-root-path').path;
- const path = require('path');
- const glob = require('fast-glob');
- const { parse } = require('node-html-parser'); // patched version github:taoqf/node-fast-html-parser#60ea8fee51f07fbc712b5642a0496f12748eb90f
- const htmlEntities = require('he');
// load the eleventy config to find the input directory
const elevConf = require(`${appRoot}/config/eleventy.config.js`)();

// all paths below are relative to the app root
process.chdir(appRoot);
const scriptPath = path.relative(appRoot, process.argv[1]);

const inputDir = elevConf.dir.input;
const infilesGlob = `${inputDir}/pages/*.html`;

const sourceLangList = ['de', 'en']; // TODO get from 11ty metadata
// https://github.com/iansan5653/unraw/issues/29
/**
 * Escape slashes for deepl.com:
 *   /  ->  \/
 *   \  ->  \\
 * All other characters are passed through unchanged.
 * @param {string} str - text to escape
 * @returns {string} escaped text
 */
function deeplBackslashEncode(str) {
  // prefix every forward slash and every backslash with a backslash
  return str.replace(/[\/\\]/g, (slash) => `\\${slash}`);
}
/**
 * Format a date as a filename-safe local timestamp.
 * The string is built from the date components, so the result
 * does not depend on the locale data available to the runtime.
 * @param {Date|null} [date=null] - date to format, defaults to the current time
 * @returns {string} sample result: '2021-03-21.21-05-36'
 */
function dateTime(date = null) {
  const d = date || new Date();
  const pad = (num, width = 2) => String(num).padStart(width, '0');
  const day = [pad(d.getFullYear(), 4), pad(d.getMonth() + 1), pad(d.getDate())].join('-');
  const time = [pad(d.getHours()), pad(d.getMinutes()), pad(d.getSeconds())].join('-');
  return `${day}.${time}`;
}
// timestamp of this script run
const nowDate = dateTime();
- const crypto = require("crypto");
/**
 * Compute the SHA-1 digest of a string.
 * @param {string} str - input text
 * @returns {string} base64-encoded digest
 */
function sha1sum(str) {
  const hash = crypto.createHash('sha1');
  hash.update(str);
  return hash.digest('base64');
}
// google can translate -- to -
// so we use "safe" ids without repetition
/**
 * Find the next id after lastId whose string form
 * has no two identical characters next to each other.
 * @param {number} lastId - last id that was used
 * @returns {number} next safe id
 */
function getNextSafeId(lastId) {
  const repeatedChar = /(.)\1/; // any character directly followed by itself
  let candidate = lastId + 1;
  while (repeatedChar.test(String(candidate))) {
    candidate++;
  }
  return candidate;
}
- /////////////////////// export ////////////////////////////
/**
 * Export step: collect all text fragments of sourceLang from the input files
 * and generate translate-<sourceLang>2<targetLang>.html with translator links.
 *
 * Html tags, html entities and extra whitespace are replaced by
 * "symbols in square braces" (see encodeNum). The original values and the
 * line indents are stored as replacementData in a comment of the html file,
 * where importLang() reads them back.
 *
 * Exits the process when sourceLang is unknown or when the output file exists.
 *
 * @param {string} [sourceLang='de'] - language of the existing text
 * @param {string} [targetLang='en'] - language to translate to
 */
function exportLang(sourceLang = 'de', targetLang = 'en') {
  // TODO more dynamic ...
  // check src/pages/*.html if sourceLang is found
  if (!sourceLangList.includes(sourceLang)) {
    console.log(`error: sourceLang ${sourceLang} not found. must be one of: ${sourceLangList.join(', ')}`);
    process.exit(1);
  }

  const htmlFile = `translate-${sourceLang}2${targetLang}.html`;
  if (!dryRunExport && fs.existsSync(htmlFile)) {
    console.log(`error: output file exists: ${htmlFile}`);
    console.log(`\nsolutions:`);
    console.log(`mv ${htmlFile} ${htmlFile}.${nowDate}.bak`);
    console.log(`rm ${htmlFile}`);
    process.exit(1);
  }

  console.log(`glob: ${infilesGlob}`);

  const textParts = [];
  const replacementData = {
    replacementList: {}, // sparse array due to "safe" ids, see getNextSafeId
    lastId: -1,
  };

  // store the matched markup under a new safe id and return its code
  function getReplace(match) {
    const replacementId = getNextSafeId(replacementData.lastId);
    const code = encodeNum(replacementId);
    replacementData.lastId = replacementId;
    replacementData.replacementList[replacementId] = {
      value: match,
      code,
      indentList: [],
    };
    return `\n[${code}]\n`;
  }

  // everything that must not be translated,
  // including all whitespace around the original value
  const encodeRegex = new RegExp(
    [
      `\\s*`, // space before
      `(?:`,
      `\\[[${codeNumRegexCharClassImport}]+\\]`, // "symbols in square braces"
      `|`,
      `\\n{2,}`, // extra newlines: needed for transliterated translations
      `|`,
      `<.+?>`, // html tags
      `|`,
      // html entities: named, decimal or hex.
      // a strict pattern, so text between two semicolons is never swallowed
      `&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);`,
      `)`,
      `\\s*` // space after
    ].join(''),
    'sg'
  );
  // use lookahead (?=...) to include delimiter as prefix
  const codeSplitRegex = new RegExp(`(?=\\n\\[[${codeNumRegexCharClass}]+\\]\\n)`);
  const codeChunkRegex = new RegExp(`(\\n\\[([${codeNumRegexCharClass}]+)\\]\\n)(.*)$`, 's');

  // loop input files
  glob.sync(infilesGlob)
    //.slice(0, 2) // debug: process less input files
    .forEach((file, fileIdx) => {
      console.log(`input: ${file}`);
      const inputHtml = fs.readFileSync(file, 'utf8');
      const root = parse(inputHtml);
      const parentNodes = root.querySelectorAll('langs, .langs');

      // loop parentNodes -> get textParts
      for (const [pi, p] of parentNodes.entries()) {
        // loop nodes
        p.querySelectorAll(`${sourceLang}, *[lang="${sourceLang}"]`).forEach((n, ni) => {
          // nodes without lang attribute are <sourceLang> tags -> wrap in a new tag
          const wrap = (n.hasAttribute('lang') === false);
          const nodeStart = n._source.start; // only in patched version of html parser
          const lineStart = inputHtml.lastIndexOf('\n', nodeStart) + 1;
          const indent = inputHtml.slice(lineStart, nodeStart).match(/^\s*/)[0];
          const tagName = `lang.${targetLang}`;
          // base of translation = sourceText
          const base = `${sourceLang}#${sha1sum(n.innerHTML).slice(0, 8)}`;
          const extraAttrs = `rev="${base}"`; // add revision ID
          // TODO properly parse + replace attributes if wrap == false
          const sBase = indent + (wrap
            ? `<${tagName} ${extraAttrs}>${n.innerHTML}</${tagName}>`
            : n.outerHTML.replace(new RegExp(`^<([^>\\s]+)\\s+[^>]*lang="${sourceLang}"[^>]*>`, 's'), `<$1 lang="${targetLang}" ${extraAttrs}>`)
          );
          if (showDebug) console.dir({ indent, wrap, tagName, extraAttrs });

          // the f/p/n attributes identify the source node in importLang()
          const sXml = `<html f="${fileIdx}" p="${pi}" n="${ni}">\n${sBase}\n</html>`;
          if (showDebug) console.log(`textPart before replace:\n${sXml}`);

          // encode html
          // replace with "symbols in square braces"
          let textPart = sXml.replace(encodeRegex, match => getReplace(match));

          // encode indents between replacements
          textPart = textPart
            .split(codeSplitRegex)
            .map((chunk) => {
              const m = chunk.match(codeChunkRegex);
              if (!m) return chunk; // no code in this chunk -> keep as is
              const [, replacement, idxStr, rest] = m;
              const indentList = [];
              replacementData.replacementList[decodeNum(idxStr)].indentList = indentList;
              // remove indents
              const restNoIndent = rest.split('\n').map((line) => {
                const [, lineIndent, lineRest] = line.match(/^(\s*)(.*)/);
                indentList.push(lineIndent);
                return lineRest;
              }).join('\n');
              return replacement + restNoIndent;
            })
            .join('');

          if (showDebug) console.log(`textPart after replace:\n${textPart}`);
          textParts.push(textPart);
        });
      } // done loop parentNodes
    }); // done loop input files

  if (showDebug) {
    for (const id of Object.keys(replacementData.replacementList)) { // sparse array
      console.log(`[${encodeNum(id)}] = id ${id} = ${replacementData.replacementList[id].value}`);
    }
  }

  if (dryRunExport) return;

  // group text parts, so every group stays below the limit of the translator
  const groupOverhead = 3 * (`\n\n<meta attrrrrrrrr="vallll"/>\n\n`.length);
  const textGroups = textParts.reduce((acc, val) => {
    const current = acc[acc.length - 1];
    const nextLen = current.length + val.length + groupOverhead;
    // start a new group when the limit is reached,
    // but never leave an empty group behind (oversized first part)
    if (nextLen >= charLimit && current.length > 0) {
      acc.push('');
    }
    acc[acc.length - 1] += val + '\n\n';
    return acc;
  }, ['']);
  // TODO group sibling replacements (an earlier attempt was disabled as possibly broken)

  if (showDebug) {
    console.log(textGroups.map((s, i) => `textGroup ${i}:\n${s}\n`).join('\n'));
  }

  // generate links
  const translateUrl = t => (
    translatorName == 'google' ? `https://translate.google.com/?sl=${sourceLang}&tl=${targetLang}&text=${encodeURIComponent(t)}&op=translate` :
    translatorName == 'deepl' ? `https://www.deepl.com/translator#${sourceLang}/${targetLang}/${encodeURIComponent(deeplBackslashEncode(t))}` :
    '#invalid-translatorName'
  );
  const translateLinks = textGroups.map(t => (
    `<li><a target="_blank" href="${translateUrl(t)}">${htmlEntities.encode(t.slice(0, previewTextLength/2))} ... ${htmlEntities.encode(t.slice(-previewTextLength/2))}</a></li>`
  ));

  const htmlSrc = (
    '<style>' +
    'a:visited { color: green; }' +
    'a { text-decoration: none; }' +
    'a:hover { text-decoration: underline; }' +
    'li { margin-bottom: 1em; }' +
    '</style>' +
    '<ol>\n\n' + translateLinks.join('\n\n') + '</ol>\n' +
    // embed replacements in html comment
    '<!-- replacementData = ' +
    JSON.stringify(replacementData, null, 2) +
    ' = replacementData -->'
  );
  fs.writeFileSync(htmlFile, htmlSrc, 'utf8');

  const htmlFileUrl = encodeURI('file://' + path.resolve(htmlFile));
  console.log(`output: ${htmlFile}`);
  console.log(`
next steps:
1. open in your browser:
${htmlFileUrl}
2. click the first link
3. fix the translation on the translator website,
so the translator can learn to translate better
4. scroll down, on the bottom right, click on: copy translation
5. paste the translation to your text editor
remove the footers:
Translated with www.DeepL.com/Translator (free version)
6. repeat for all links (append translations to text file)
7. save the text file, for example as translate-${sourceLang}2${targetLang}.txt
8. run this script again with the text file, for example:
node ${scriptPath} ${sourceLang} ${targetLang} translate-${sourceLang}2${targetLang}.txt
9. add the new language code to src/_data/metadata.js -> metadata.languages
10. restart the dev server (to reload metadata.js. hot reload is not working here)
11. commit the new translation:
git add src/pages/ src/_data/metadata.js
git commit -m 'alchi-book: add ${targetLang} translation'
12. commit the new build:
# stop the dev server
npm run build
git add build/
git commit -m 'alchi-book: update build'
note:
translators will change the order of words,
so in some cases, html markup tags like <b>....</b>
will be in a wrong position.
note:
the ${htmlFile} file is valid only for one iteration.
if you added nodes to the html files,
then you must generate a new ${htmlFile} file
`);
}
- /////////////////////// import ////////////////////////////
/**
 * Import step: read the translated text from inputFile, decode the
 * "symbols in square braces" with the replacementData stored in
 * translate-<sourceLang>2<targetLang>.html, and insert every translated
 * fragment after its source node in the input files.
 *
 * Changed files are moved to a backup directory first, together with
 * a rollback.sh and a diff.sh script.
 *
 * Exits the process when the html file or its replacementData is missing,
 * when a code cannot be decoded, or when the <html> blocks are missing or unbalanced.
 *
 * @param {string} sourceLang - language of the existing text
 * @param {string} targetLang - language of the translation
 * @param {string} inputFile - text file with the translations
 */
function importLang(sourceLang, targetLang, inputFile) {
  let input = fs.readFileSync(inputFile, 'utf8');

  // remove unwanted characters
  input = input.replace(new RegExp(`[${removeRegexCharClass}]`, 'g'), '');

  // load replacements
  const htmlFile = `translate-${sourceLang}2${targetLang}.html`;
  if (!fs.existsSync(htmlFile)) {
    console.log(`error: html file not found: ${htmlFile}`);
    console.log('this file is required to decode replacements');
    process.exit(1);
  }
  const htmlSrc = fs.readFileSync(htmlFile, 'utf8');
  const replacementDataMatch = htmlSrc.match(/<!-- replacementData = (.*) = replacementData -->/s);
  if (replacementDataMatch == null) {
    console.log(`parse error: replacementData not found in ${htmlFile}`);
    process.exit(1);
  }
  const replacementData = JSON.parse(replacementDataMatch[1]);
  const replacementList = replacementData.replacementList;
  console.log(`loaded ${Object.keys(replacementList).length} replacements from ${htmlFile}`);
  if (showDebug) {
    for (const id of Object.keys(replacementList)) { // sparse array
      console.dir({ id, replacement: replacementList[id] });
      console.log(`[${encodeNum(id)}] = id ${id} = ${replacementList[id].value}`);
    }
  }

  // quickfix to restore newlines around replacements
  input = ('\n' + input + '\n');

  // decode replacements and indents
  // use lookahead (?=...) to include delimiter as prefix
  const blockSplitRegex = new RegExp(`(?=\\n\\[[${codeNumRegexCharClassImport}]+\\]\\n)`);
  const blockRegex = new RegExp(`(\\n?\\[([${codeNumRegexCharClassImport}]+)\\]\\n?)(.*)$`, 's'); // optional \n
  let lastReplaceId = -1;
  input = input
    .split(blockSplitRegex)
    .map((str, blockIdx) => {
      if (showDebug) console.dir({ blockStr: str });
      const m = str.match(blockRegex);
      if (!m) return str; // no replace
      const [, _replacement, idxStr, translated] = m;
      const replaceId = decodeNum(idxStr);

      // test for steadiness = simple validation
      if (replaceId < lastReplaceId) {
        console.log(`error: replaceId is not steady. did the translator break our code?`);
        console.log(`lastReplaceId = ${lastReplaceId}`);
        console.log(`replaceId = ${replaceId}`);
      }
      if (showDebug) console.dir({ _replacement, idxStr, replaceId, rest: translated });

      const entry = replacementList[replaceId];
      if (!entry) {
        console.log(`error: invalid replaceId ${replaceId}. did the translator break our code?`);
        console.dir({ _replacement, idxStr, replaceId, rest: translated });
        console.log(`last valid replaceId was ${lastReplaceId}`);
        // TODO show context in sourceText?
        // we are looking for the original replacement-code
        const nextSafeId = getNextSafeId(lastReplaceId);
        const nextEntry = replacementList[nextSafeId];
        console.log(`next safe replaceId would be ${nextSafeId}:`);
        console.dir(nextEntry);
        if (nextEntry) { // missing when the last replacement was already consumed
          console.log(`wild guess: the translator translated our code ${idxStr} to ${nextEntry.code}`);
          console.dir({ originalCode: nextEntry.code, translatedCode: idxStr });
        }
        console.log(`error: failed to decode replacements`);
        // its a bad solution to guess the right replaceId
        // cos in many cases, this will fail silently, and produce errors in the result
        // we can reduce the chance of "false positives"
        // by making the replacement ids more sparse, or random
        process.exit(1); // fatal error
      }
      if (entry.consumed) {
        console.log(`error: duplicate replaceId ${replaceId}`);
        console.dir({ _replacement, idxStr, replaceId, rest: translated });
        console.log(`replacement was already consumed in:`);
        console.dir(entry.consumedBy);
        console.log(`error: failed to decode replacements`);
        process.exit(1); // fatal error
      }

      // restore indents
      // FIXME make this more robust against newline removing (transliterated translations)
      const rest = translated.split('\n').map((lineRest, lineIdx) => {
        const indent = entry.indentList[lineIdx] || '';
        return indent + lineRest;
      }).join('\n');

      // decode replacement
      entry.consumed = true; // prevent collisions
      entry.consumedBy = { blockIdx, blockStr: str };
      lastReplaceId = replaceId;
      return entry.value + rest;
    })
    .join('');
  if (showDebug) {
    console.log('decoded html:');
    console.log(input);
  }

  // validate html
  // String.match returns null when nothing is found
  const numOpen = (input.match(/<html/g) || []).length;
  const numClose = (input.match(/<\/html>/g) || []).length;
  if (numOpen === 0 && numClose === 0) {
    console.log(`fatal error: no <html> blocks found in ${inputFile}. translation was not imported`);
    process.exit(1);
  }
  if (numOpen != numClose) {
    console.log(`ERROR: html tags mismatch in ${inputFile}:`);
    console.log(`open <html> tags: ${numOpen}`);
    console.log(`close </html> tags: ${numClose}`);

    // locate first mismatch
    // assume: html tags are not nested
    const tags = []; // open and close tags, sorted by position
    const tagRegex = /<html|<\/html>/g;
    let tagMatch;
    while ((tagMatch = tagRegex.exec(input)) !== null) {
      tags.push({ pos: tagMatch.index, isOpen: tagMatch[0] === '<html' });
    }
    // position of tag i, clamped to the input range for the context output
    const posAt = (i) => (i < 0 ? 0 : (i >= tags.length ? input.length : tags[i].pos));
    let level = 0;
    for (let tagIdx = 0; tagIdx < tags.length; tagIdx++) {
      level += tags[tagIdx].isOpen ? 1 : -1;
      if (level < 0 || 1 < level) {
        const idx = tags[tagIdx].pos;
        const last2Idx = posAt(tagIdx - 2);
        const lastIdx = posAt(tagIdx - 1);
        const nextIdx = posAt(tagIdx + 1);
        const next2Idx = posAt(tagIdx + 2);
        const m = '='.repeat(30);
        console.log(`html tags mismatch starts at position ${idx}. context:`);
        console.log(`${m} pos -2 = ${last2Idx} ${m}`);
        console.log(input.slice(last2Idx, lastIdx));
        console.log(`${m} pos -1 = ${lastIdx} ${m}`);
        console.log(input.slice(lastIdx, idx));
        console.log(`${m} pos 0 = ${idx} ${m}`);
        console.log(input.slice(idx, nextIdx));
        console.log(`${m} pos +1 = ${nextIdx} ${m}`);
        console.log(input.slice(nextIdx, next2Idx));
        console.log(`${m} pos +2 = ${next2Idx} ${m}`);
        break; // show only first
      }
    }
    console.log(`fatal error: html tags mismatch. translation was not imported`);
    process.exit(1);
  }

  // parse
  const textParts = [];
  const inputRest = input.replace(
    /<html f="([0-9]+)" p="([0-9]+)" n="([0-9]+)">\n(.+?)\n<\/html>/sg,
    (_, file, part, node, str) => {
      textParts.push({
        file: Number.parseInt(file, 10),
        part: Number.parseInt(part, 10),
        node: Number.parseInt(node, 10),
        str,
      });
      return ''; // remove from input
    }
  );
  // should be empty ...
  const inputRestTrimmed = inputRest.trim();
  if (inputRestTrimmed.length > 0) {
    console.log(`inputRest = ${JSON.stringify(inputRestTrimmed.replace(/\n+/sg, '\n'))}`);
  }

  // quote a path for the generated shell scripts
  const shQuote = (str) => "'" + String(str).replace(/'/g, "'\\''") + "'";

  const backupDir = `backup/translate/${nowDate}.${sourceLang}2${targetLang}`;
  const rollbackScriptPath = `${backupDir}/rollback.sh`;
  let rollbackScript = [
    '#!/usr/bin/env bash',
    '# rollback translation',
    'shopt -s expand_aliases',
    `alias mv='mv --verbose'`,
    '',
  ].join('\n');
  const diffScriptPath = `${backupDir}/diff.sh`;
  let diffScript = [
    '#!/usr/bin/env bash',
    '# diff translation',
    'shopt -s expand_aliases',
    `alias diff='diff --unified --color=auto'`,
    '',
  ].join('\n');

  console.log(`glob: ${infilesGlob}`);
  // we assume that source files did not change since we called exportLang()
  // TODO verify this automatically, a mismatch should be a fatal error
  let changedFiles = 0;
  glob.sync(infilesGlob)
    //.slice(0, 2) // debug: process less input files
    .forEach((file, fileIdx) => {
      const sourceBefore = fs.readFileSync(file, 'utf8');
      const root = parse(sourceBefore);
      const parentNodes = root.querySelectorAll('langs, .langs');
      let insertedNodes = 0;

      // loop parentNodes -> insert textParts
      for (const [pi, p] of parentNodes.entries()) {
        p.querySelectorAll(`${sourceLang}, *[lang="${sourceLang}"]`).forEach((n, ni) => {
          const textNode = textParts.find(o => (
            o.file == fileIdx &&
            o.part == pi &&
            o.node == ni
          ));
          if (!textNode) return; // no imported translation for this node
          n.insertAdjacentHTML('afterend', '\n' + textNode.str);
          insertedNodes++;
          if (dryRunImport) {
            const indent = textNode.str.match(/^\s*/)[0]; // quick n dirty ...
            console.log(`old node:`);
            console.dir({ ...textNode, str: '', file });
            console.log(indent + n.toString());
            console.log(`new node:`);
            console.log(textNode.str);
          }
        });
      }

      if (!dryRunImport && insertedNodes > 0) {
        // move original to backup
        const backupFile = `${backupDir}/${file}`;
        fs.mkdirSync(path.dirname(backupFile), { recursive: true });
        fs.renameSync(file, backupFile);
        console.log(`output: ${backupFile}`);
        rollbackScript += `mv ${shQuote(backupFile)} ${shQuote(file)}\n`;
        diffScript += `diff ${shQuote(backupFile)} ${shQuote(file)}\n`;
        changedFiles++;
        const outFile = file; // replace input file (after creating a backup copy)
        fs.writeFileSync(outFile, root.toString(), 'utf8');
        console.log(`output: ${outFile}`);
      }
    });

  if (!dryRunImport && changedFiles > 0) {
    rollbackScript += `mv ${shQuote(backupDir)} ${shQuote(backupDir + '.rolled-back')}\n`;
    fs.writeFileSync(rollbackScriptPath, rollbackScript, 'utf8');
    console.log(`output: ${rollbackScriptPath}`);
    fs.writeFileSync(diffScriptPath, diffScript, 'utf8');
    console.log(`output: ${diffScriptPath}`);
  }
}
- main();
|