diff --git a/app/dictionary-tools.gradle b/app/dictionary-tools.gradle
index 125b2faa..2241bb0b 100644
--- a/app/dictionary-tools.gradle
+++ b/app/dictionary-tools.gradle
@@ -1,6 +1,6 @@
 class Wrapper {
 	static def getDictionaryLineData(String line, String delimiter) {
-		String[] parts = line.split(delimiter, 2)
+		String[] parts = line.split(delimiter, 3)
 		String word = parts[0]
 		String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""
 
diff --git a/scripts/inject-dictionary-frequencies.js b/scripts/inject-dictionary-frequencies.js
index 7c1c5709..f0680c23 100644
--- a/scripts/inject-dictionary-frequencies.js
+++ b/scripts/inject-dictionary-frequencies.js
@@ -7,7 +7,7 @@ const DELIMITER = ' ';
 
 
 function printHelp() {
-	print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt`);
+	print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt [--transcribed]`);
 	print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
 	print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
 }
@@ -34,12 +34,13 @@ function validateInput() {
 	return {
 		locale: process.argv[2],
 		dictionaryFileName: process.argv[3],
+		transcribed: process.argv[5] === '--transcribed',
 		wordsWithFrequenciesFileName: process.argv[4]
 	};
 }
 
 
-async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) {
+async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale, transcribed }) {
 	// read the frequencies
 	let lineReader = require('readline').createInterface({
 		input: createReadStream(wordsWithFrequenciesFileName)
@@ -69,9 +70,12 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale
 
 	const outputWords = [];
 
-	for await (const word of lineReader) {
-		const lowercaseWord = word.toLocaleLowerCase(locale);
-		outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`);
+	for await (const line of lineReader) {
+		// In a transcribed dictionary, the word is the first DELIMITER-separated field of the line.
+		// Otherwise, the whole line is the word and it is lower-cased for the lookup.
+		const word = transcribed ? line.split(DELIMITER)[0] : line.toLocaleLowerCase(locale);
+		const frequency = frequencies.get(word) || 0;
+		outputWords.push(frequency > 0 ? `${line}${DELIMITER}${frequency}` : line);
 	}
 
 	return outputWords;
diff --git a/scripts/normalize-transcribed.js b/scripts/normalize-transcribed.js
new file mode 100644
index 00000000..1d7f8608
--- /dev/null
+++ b/scripts/normalize-transcribed.js
@@ -0,0 +1,107 @@
+const { basename } = require('path');
+const { existsSync, readFileSync } = require('fs');
+const { print, printError } = require('./_printers.js');
+
+
+function printHelp() {
+	print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`);
+	print('Normalizes the frequencies in a dictionary with transcriptions.');
+}
+
+
+function validateInput() {
+	if (process.argv.length < 3) {
+		printHelp();
+		process.exit(1);
+	}
+
+	if (!existsSync(process.argv[2])) {
+		printError(`Failure! Could not find word list file "${process.argv[2]}".`);
+		process.exit(2);
+	}
+
+	return {
+		fileName: process.argv[2]
+	};
+}
+
+
+/**
+ * Splits one "WORD<TAB>TRANSCRIPTION[<TAB>FREQUENCY]" line into its fields.
+ * A missing or non-numeric frequency is stored as null.
+ */
+function parseLine(line) {
+	const [chinese, latin, rawNumber] = line.split('\t');
+	const number = Number.parseInt(rawNumber, 10);
+
+	return {
+		original: line,
+		chinese,
+		latin,
+		number: Number.isNaN(number) ? null : number
+	};
+}
+
+
+/**
+ * Groups the entries by transcription, keeping the order in which each
+ * transcription first appears. A Map is used instead of a plain object, so that
+ * transcriptions like "constructor" do not collide with Object.prototype members.
+ */
+function groupByTranscription(entries) {
+	const groups = new Map();
+
+	for (const entry of entries) {
+		if (!groups.has(entry.latin)) {
+			groups.set(entry.latin, []);
+		}
+		groups.get(entry.latin).push(entry);
+	}
+
+	return groups;
+}
+
+
+/**
+ * Replaces the frequencies in a group with ordinal ranks: the most frequent word
+ * gets the highest rank and the least frequent one gets 1. Words without a
+ * frequency follow the ranked ones in their original order.
+ */
+function rankGroup(group) {
+	const ranked = group.filter(e => e.number !== null).sort((a, b) => b.number - a.number);
+	const unranked = group.filter(e => e.number === null);
+
+	ranked.forEach((entry, i) => {
+		entry.number = ranked.length - i;
+	});
+
+	return [...ranked, ...unranked];
+}
+
+
+function printWords(wordList) {
+	if (!Array.isArray(wordList)) {
+		return;
+	}
+
+	for (const w of wordList) {
+		if (w.latin === undefined) {
+			// no transcription on this line; print it as it was, instead of "undefined"
+			print(w.original);
+		} else {
+			print(w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}`);
+		}
+	}
+}
+
+
+const { fileName } = validateInput();
+
+// accept both LF and CRLF line endings; an empty file yields no entries
+const data = readFileSync(fileName, 'utf8').trim();
+const lines = data === '' ? [] : data.split(/\r?\n/);
+
+// rank the words sharing a transcription, keeping the transcriptions in file order
+const sortedEntries = [...groupByTranscription(lines.map(parseLine)).values()].flatMap(rankGroup);
+
+printWords(sortedEntries);