added support for frequencies in dictionaries with transcriptions

2025-03-18 19:19:57 +02:00 · 2025-03-18 19:19:57 +02:00 · 51cd39fe27
commit 51cd39fe27
parent 795145fb2b
3 changed files with 100 additions and 6 deletions
--- a/app/dictionary-tools.gradle
+++ b/app/dictionary-tools.gradle
@ -1,6 +1,6 @@
 class Wrapper {
 	static def getDictionaryLineData(String line, String delimiter) {
-		String[] parts = line.split(delimiter, 2)
+		String[] parts = line.split(delimiter, 3)
 		String word = parts[0]
 		String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""
--- a/scripts/inject-dictionary-frequencies.js
+++ b/scripts/inject-dictionary-frequencies.js
@ -7,7 +7,7 @@ const DELIMITER = '	';
 function printHelp() {
-	print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt`);
+	print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt --transcribed`);
 	print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
 	print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
 }
@ -34,12 +34,13 @@ function validateInput() {
 	return {
 		locale: process.argv[2],
 		dictionaryFileName: process.argv[3],
 		transcribed: process.argv[5] !== undefined && process.argv[5] === '--transcribed',
 		wordsWithFrequenciesFileName: process.argv[4]
 	};
 }
-async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) {
+async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale, transcribed }) {
 	// read the frequencies
 	let lineReader = require('readline').createInterface({
 	  input: createReadStream(wordsWithFrequenciesFileName)
@ -69,9 +70,17 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale
 	const outputWords = [];
-	for await (const word of lineReader) {
+	for await (const line of lineReader) {
-		const lowercaseWord = word.toLocaleLowerCase(locale);
+		let word = '';
-		outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`);
+
 		if (transcribed) {
 			const parts = line.split(DELIMITER);
 			word = parts[0];
 		} else {
 			word = line.toLocaleLowerCase(locale);
 		}
 		outputWords.push(`${line}${ (frequencies.get(word) || 0) > 0 ? DELIMITER + frequencies.get(word) : '' }`);
 	}
 	return outputWords;
--- a/scripts/normalize-transcribed.js
+++ b/scripts/normalize-transcribed.js
@ -0,0 +1,85 @@
 const { basename } = require('path');
 const { existsSync, readFileSync } = require('fs');;
 const { print, printError } = require('./_printers.js')
 /**
  * Prints the usage text of this script: the expected command line, followed by
  * a one-line description of what the script does.
  */
 function printHelp() {
 	const scriptName = basename(process.argv[1]);
 	const helpLines = [
 		`Usage ${scriptName} WORD-LIST.txt`,
 		'Normalizes the frequencies in a dictionary with transcriptions.'
 	];
 	helpLines.forEach(helpLine => print(helpLine));
 }
 /**
  * Validates the command-line arguments of the script.
  *
  * Prints the usage text and exits with code 1 when no word list argument was
  * given; reports an error and exits with code 2 when the given file does not
  * exist.
  *
  * @returns {{fileName: string}} the word list path taken from the first script argument
  */
 function validateInput() {
 	const hasFileArgument = process.argv.length >= 3;
 	if (!hasFileArgument) {
 		printHelp();
 		process.exit(1);
 	}
 	const fileName = process.argv[2];
 	const isMissing = !existsSync(fileName);
 	if (isMissing) {
 		printError(`Failure! Could not find word list file "${fileName}".`);
 		process.exit(2);
 	}
 	return { fileName };
 }
 /**
  * Prints a word list, one entry per line, with tab-separated columns.
  *
  * Each entry is printed as "chinese<TAB>latin", followed by "<TAB>number" when
  * the entry's number is not null. Anything that is not an array is ignored.
  *
  * @param {Array<{chinese: *, latin: *, number: *}>} wordList entries to print
  */
 function printWords(wordList) {
 	if (!Array.isArray(wordList)) {
 		return;
 	}
 	wordList.forEach(entry => {
 		const hasNumber = entry.number !== null;
 		if (hasNumber) {
 			print(`${entry.chinese}\t${entry.latin}\t${entry.number}`);
 		} else {
 			print(`${entry.chinese}\t${entry.latin}`);
 		}
 	});
 }
 // Main script: reads the word list, groups the entries by transcription, replaces the
 // frequencies inside each group with ordinal ranks and prints the result.
 const { fileName } = validateInput();
 const data = readFileSync(fileName, 'utf8');
 // Accept both Unix and Windows line endings, so a stray "\r" never becomes part of the last
 // column (and therefore of a group key). Blank lines are skipped, so an empty file yields no
 // output instead of a bogus "<TAB>undefined" entry.
 const lines = data.trim().split(/\r?\n/).filter(line => line.trim() !== '');
 // Parse each line into its tab-separated columns: word, transcription and an optional number
 const entries = lines.map(line => {
 	const parts = line.split('\t');
 	return {
 		original: line,
 		chinese: parts[0],
 		latin: parts[1],
 		number: parts[2] ? parseInt(parts[2], 10) : null
 	};
 });
 // Group entries by the Latin character sequence.
 // A Map is used instead of a plain object because the keys come straight from the input:
 // with an object, a transcription such as "constructor" or "toString" would resolve to an
 // inherited Object.prototype member and crash on push(), and integer-like keys would be
 // enumerated out of order. A Map always iterates in first-seen order.
 const groups = new Map();
 for (const entry of entries) {
 	if (!groups.has(entry.latin)) {
 		groups.set(entry.latin, []);
 	}
 	groups.get(entry.latin).push(entry);
 }
 // Process each group: sort by number (descending) and reassign ordinal numbers
 const sortedEntries = [];
 for (const group of groups.values()) {
 	// Separate entries with and without numbers
 	const withNumbers = group.filter(e => e.number !== null);
 	const withoutNumbers = group.filter(e => e.number === null);
 	// Sort by number in descending order
 	withNumbers.sort((a, b) => b.number - a.number);
 	// Assign ordinal rankings: the entry with the highest number gets the group size,
 	// the one with the lowest number gets 1
 	withNumbers.forEach((entry, i) => {
 		entry.number = (withNumbers.length - i).toString();
 	});
 	// Entries without numbers keep their original relative order, after the ranked ones
 	sortedEntries.push(...withNumbers, ...withoutNumbers);
 }
 printWords(sortedEntries);