1
0
Fork 0

added support for frequencies in dictionaries with transcriptions

This commit is contained in:
sspanak 2025-03-18 19:19:57 +02:00 committed by Dimo Karaivanov
parent 795145fb2b
commit 51cd39fe27
3 changed files with 100 additions and 6 deletions

View file

@ -1,6 +1,6 @@
class Wrapper { class Wrapper {
static def getDictionaryLineData(String line, String delimiter) { static def getDictionaryLineData(String line, String delimiter) {
String[] parts = line.split(delimiter, 2) String[] parts = line.split(delimiter, 3)
String word = parts[0] String word = parts[0]
String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : "" String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""

View file

@ -7,7 +7,7 @@ const DELIMITER = ' ';
function printHelp() { function printHelp() {
print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt`); print(`Usage ${basename(process.argv[1])} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt --transcribed`);
print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.'); print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...'); print('LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...');
} }
@ -34,12 +34,13 @@ function validateInput() {
return { return {
locale: process.argv[2], locale: process.argv[2],
dictionaryFileName: process.argv[3], dictionaryFileName: process.argv[3],
transcribed: process.argv[5] !== undefined && process.argv[5] === '--transcribed',
wordsWithFrequenciesFileName: process.argv[4] wordsWithFrequenciesFileName: process.argv[4]
}; };
} }
async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) { async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale, transcribed }) {
// read the frequencies // read the frequencies
let lineReader = require('readline').createInterface({ let lineReader = require('readline').createInterface({
input: createReadStream(wordsWithFrequenciesFileName) input: createReadStream(wordsWithFrequenciesFileName)
@ -69,9 +70,17 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale
const outputWords = []; const outputWords = [];
for await (const word of lineReader) { for await (const line of lineReader) {
const lowercaseWord = word.toLocaleLowerCase(locale); let word = '';
outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`);
if (transcribed) {
const parts = line.split(DELIMITER);
word = parts[0];
} else {
word = line.toLocaleLowerCase(locale);
}
outputWords.push(`${line}${ (frequencies.get(word) || 0) > 0 ? DELIMITER + frequencies.get(word) : '' }`);
} }
return outputWords; return outputWords;

View file

@ -0,0 +1,85 @@
const { basename } = require('path');
const { existsSync, readFileSync } = require('fs');;
const { print, printError } = require('./_printers.js')
/**
 * Prints the command-line usage of this script.
 *
 * The script name shown in the usage line is taken from the path in process.argv[1].
 */
function printHelp() {
	const scriptName = basename(process.argv[1]);
	const helpLines = [
		`Usage ${scriptName} WORD-LIST.txt`,
		'Normalizes the frequencies in a dictionary with transcriptions.'
	];

	// Wrap in an arrow function, so print() receives exactly one argument per line.
	helpLines.forEach(helpLine => print(helpLine));
}
/**
 * Validates the command-line arguments.
 *
 * Exits with code 1 (after printing the help) when no word list argument is given, and with
 * code 2 (after printing an error) when the given word list file does not exist.
 *
 * @returns {{fileName: string}} the path of the word list file, taken from process.argv[2]
 */
function validateInput() {
	const args = process.argv;

	if (args.length < 3) {
		printHelp();
		process.exit(1);
	}

	const fileName = args[2];
	if (!existsSync(fileName)) {
		printError(`Failure! Could not find word list file "${fileName}".`);
		process.exit(2);
	}

	return { fileName };
}
/**
 * Prints every entry of the word list as one tab-separated line.
 *
 * Each line consists of the "chinese" and "latin" fields, followed by the "number" field,
 * unless the number is null. Anything that is not an array is silently ignored.
 *
 * @param {Array<{chinese: *, latin: *, number: *}>} wordList the entries to print
 */
function printWords(wordList) {
	if (!Array.isArray(wordList)) {
		return;
	}

	const formatRow = ({ chinese, latin, number }) => {
		const wordAndTranscription = `${chinese}\t${latin}`;
		return number === null ? wordAndTranscription : `${wordAndTranscription}\t${number}`;
	};

	wordList.forEach(entry => print(formatRow(entry)));
}
// Script body: reads the tab-separated word list named on the command line (one
// "word<TAB>transcription[<TAB>frequency]" entry per line), then replaces the raw frequency of
// every word with its rank among the words that share the same transcription. The result is
// printed group by group.
const { fileName } = validateInput();
const data = readFileSync(fileName, 'utf8');

// Accept both LF and CRLF line endings and drop blank lines, so that an empty file or a stray
// empty line does not become a bogus "\tundefined" entry in the output.
const lines = data.trim().split(/\r?\n/).filter(line => line.trim() !== '');

// Parse the data into an array of objects
const entries = lines.map(line => {
	const [chinese, latin, rawNumber] = line.split('\t');
	const parsedNumber = rawNumber ? Number.parseInt(rawNumber, 10) : NaN;

	return {
		original: line,
		chinese,
		latin,
		// A missing or non-numeric third column means "no frequency". Keeping NaN out of the
		// entries also keeps it out of the sort comparator below, where it would make the
		// resulting order implementation-defined.
		number: Number.isNaN(parsedNumber) ? null : parsedNumber
	};
});

// Group entries by the Latin character sequence. A Map is used instead of a plain object,
// because the keys are arbitrary text: a transcription such as "constructor" or "toString"
// would otherwise collide with the members inherited from Object.prototype and crash on push().
// A Map also guarantees that the groups are visited in order of first appearance.
const groups = new Map();
for (const entry of entries) {
	const group = groups.get(entry.latin);
	if (group === undefined) {
		groups.set(entry.latin, [entry]);
	} else {
		group.push(entry);
	}
}

// Process each group: sort by number (descending) and reassign ordinal numbers
const sortedEntries = [];
for (const group of groups.values()) {
	// Separate entries with and without numbers
	const withNumbers = group.filter(entry => entry.number !== null);
	const withoutNumbers = group.filter(entry => entry.number === null);

	// Sort by number in descending order
	withNumbers.sort((a, b) => b.number - a.number);

	// Assign ordinal rankings: the most frequent word gets the highest rank (the group size),
	// the least frequent one gets 1.
	withNumbers.forEach((entry, index) => {
		entry.number = (withNumbers.length - index).toString();
	});

	// Ranked entries go first; entries without numbers follow in their original order
	sortedEntries.push(...withNumbers, ...withoutNumbers);
}

printWords(sortedEntries);