added support for frequencies in dictionaries with transcriptions

2025-03-18 19:19:57 +02:00 · 2025-03-18 19:19:57 +02:00 · 51cd39fe27
commit 51cd39fe27
parent 795145fb2b
3 changed files with 100 additions and 6 deletions
--- a/app/dictionary-tools.gradle
+++ b/app/dictionary-tools.gradle
@ -1,6 +1,6 @@
 class Wrapper {
 	static def getDictionaryLineData(String line, String delimiter) {
-		String[] parts = line.split(delimiter, 2)
+		String[] parts = line.split(delimiter, 3)
 		String word = parts[0]
 		String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""

--- a/scripts/inject-dictionary-frequencies.js
+++ b/scripts/inject-dictionary-frequencies.js
@ -7,7 +7,7 @@ const DELIMITER = '	';


 function printHelp() {
-	print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt`);
+	print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt --transcribed`);
 	print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
 	print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
 }
@ -34,12 +34,13 @@ function validateInput() {
 	return {
 		locale: process.argv[2],
 		dictionaryFileName: process.argv[3],
+		transcribed: process.argv[5] !== undefined && process.argv[5] === '--transcribed',
 		wordsWithFrequenciesFileName: process.argv[4]
 	};
 }


-async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) {
+async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale, transcribed }) {
 	// read the frequencies
 	let lineReader = require('readline').createInterface({
 	  input: createReadStream(wordsWithFrequenciesFileName)
@ -69,9 +70,17 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale


 	const outputWords = [];
-	for await (const word of lineReader) {
-		const lowercaseWord = word.toLocaleLowerCase(locale);
-		outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`);
+	for await (const line of lineReader) {
+		let word = '';
+
+		if (transcribed) {
+			const parts = line.split(DELIMITER);
+			word = parts[0];
+		} else {
+			word = line.toLocaleLowerCase(locale);
+		}
+
+		outputWords.push(`${line}${ (frequencies.get(word) || 0) > 0 ? DELIMITER + frequencies.get(word) : '' }`);
 	}

 	return outputWords;
--- a/scripts/normalize-transcribed.js
+++ b/scripts/normalize-transcribed.js
@ -0,0 +1,85 @@
+const { basename } = require('path');
+const { existsSync, readFileSync } = require('fs');;
+const { print, printError } = require('./_printers.js')
+
+
+function printHelp() {
+	print(`Usage ${basename(process.argv[1])} WORD-LIST.txt`);
+	print('Normalizes the frequencies in a dictionary with transcriptions.');
+}
+
+
+
+function validateInput() {
+	if (process.argv.length < 3) {
+		printHelp();
+		process.exit(1);
+	}
+
+	if (!existsSync(process.argv[2])) {
+		printError(`Failure! Could not find word list file "${process.argv[2]}".`);
+		process.exit(2);
+	}
+
+	return {
+		fileName: process.argv[2]
+	};
+}
+
+
+function printWords(wordList) {
+	if (Array.isArray(wordList)) {
+		wordList.forEach(w => print(
+			w.number !== null ? `${w.chinese}\t${w.latin}\t${w.number}` : `${w.chinese}\t${w.latin}`
+		));
+	}
+}
+
+
+const { fileName } = validateInput();
+
+const data = readFileSync(fileName, 'utf8');
+const lines = data.trim().split('\n');
+
+// Parse the data into an array of objects
+let entries = lines.map(line => {
+	const parts = line.split('\t');
+	return {
+		original: line,
+		chinese: parts[0],
+		latin: parts[1],
+		number: parts[2] ? parseInt(parts[2], 10) : null
+	};
+});
+
+// Group entries by the Latin character sequence
+const groups = {};
+entries.forEach(entry => {
+	if (!groups[entry.latin]) {
+		groups[entry.latin] = [];
+	}
+	groups[entry.latin].push(entry);
+});
+
+// Process each group: sort by number (descending) and reassign ordinal numbers
+let sortedEntries = [];
+for (const key in groups) {
+	let group = groups[key];
+
+	// Separate entries with and without numbers
+	let withNumbers = group.filter(e => e.number !== null);
+	let withoutNumbers = group.filter(e => e.number === null);
+
+	// Sort by number in descending order
+	withNumbers.sort((a, b) => b.number - a.number);
+
+	// Assign ordinal rankings
+	for (let i = 0; i < withNumbers.length; i++) {
+		withNumbers[i].number = (withNumbers.length - i).toString();
+	}
+
+	// Preserve original order for entries without numbers
+	sortedEntries.push(...withNumbers, ...withoutNumbers);
+}
+
+printWords(sortedEntries);