added support for frequencies in dictionaries with transcriptions
This commit is contained in:
parent
795145fb2b
commit
51cd39fe27
3 changed files with 100 additions and 6 deletions
|
|
@ -1,6 +1,6 @@
|
|||
class Wrapper {
|
||||
static def getDictionaryLineData(String line, String delimiter) {
|
||||
String[] parts = line.split(delimiter, 2)
|
||||
String[] parts = line.split(delimiter, 3)
|
||||
String word = parts[0]
|
||||
String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ const DELIMITER = ' ';
|
|||
|
||||
|
||||
/**
 * Prints the command-line usage help for this script.
 *
 * Two invocation forms are listed: one without and one with the trailing
 * `--transcribed` argument.
 */
function printHelp() {
  // The script name is derived from the path the script was started with.
  const scriptName = basename(process.argv[1]);

  print(`Usage ${scriptName} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt`);
  print(`Usage ${scriptName} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt --transcribed`);
  print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
  print('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
}
|
||||
|
|
@ -34,12 +34,13 @@ function validateInput() {
|
|||
return {
|
||||
locale: process.argv[2],
|
||||
dictionaryFileName: process.argv[3],
|
||||
transcribed: process.argv[5] !== undefined && process.argv[5] === '--transcribed',
|
||||
wordsWithFrequenciesFileName: process.argv[4]
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) {
|
||||
async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale, transcribed }) {
|
||||
// read the frequencies
|
||||
let lineReader = require('readline').createInterface({
|
||||
input: createReadStream(wordsWithFrequenciesFileName)
|
||||
|
|
@ -69,9 +70,17 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale
|
|||
|
||||
|
||||
const outputWords = [];
|
||||
for await (const word of lineReader) {
|
||||
const lowercaseWord = word.toLocaleLowerCase(locale);
|
||||
outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`);
|
||||
for await (const line of lineReader) {
|
||||
let word = '';
|
||||
|
||||
if (transcribed) {
|
||||
const parts = line.split(DELIMITER);
|
||||
word = parts[0];
|
||||
} else {
|
||||
word = line.toLocaleLowerCase(locale);
|
||||
}
|
||||
|
||||
outputWords.push(`${line}${ (frequencies.get(word) || 0) > 0 ? DELIMITER + frequencies.get(word) : '' }`);
|
||||
}
|
||||
|
||||
return outputWords;
|
||||
|
|
|
|||
85
scripts/normalize-transcribed.js
Normal file
85
scripts/normalize-transcribed.js
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
const { basename } = require('path');
|
||||
const { existsSync, readFileSync } = require('fs');;
|
||||
const { print, printError } = require('./_printers.js')
|
||||
|
||||
|
||||
/**
 * Prints a short usage message: the expected invocation followed by a
 * one-line description of what the script does.
 */
function printHelp() {
  const scriptName = basename(process.argv[1]);
  const helpLines = [
    `Usage ${scriptName} WORD-LIST.txt`,
    'Normalizes the frequencies in a dictionary with transcriptions.'
  ];

  for (const helpLine of helpLines) {
    print(helpLine);
  }
}
|
||||
|
||||
|
||||
|
||||
/**
 * Checks the command-line arguments and returns the parsed input.
 *
 * Prints the help and exits with code 1 when fewer than three argv entries are
 * present; prints an error and exits with code 2 when the file named by the
 * first script argument does not exist.
 *
 * @returns {{ fileName: string }} the word list path taken from `process.argv[2]`
 */
function validateInput() {
  const hasFileArgument = process.argv.length >= 3;
  if (!hasFileArgument) {
    printHelp();
    process.exit(1);
  }

  const fileName = process.argv[2];
  const fileIsMissing = !existsSync(fileName);
  if (fileIsMissing) {
    printError(`Failure! Could not find word list file "${fileName}".`);
    process.exit(2);
  }

  return { fileName };
}
|
||||
|
||||
|
||||
/**
 * Prints every entry of the word list as one tab-separated line.
 *
 * Each line consists of `chinese`, then `latin`, then `number`; trailing
 * columns that are missing (`null` or `undefined`) are left out instead of
 * being printed as the literal text "null"/"undefined".
 *
 * @param {Array<{chinese: string, latin?: string, number?: (number|string|null)}>} wordList
 *   the entries to print; anything that is not an array is ignored
 */
function printWords(wordList) {
  if (!Array.isArray(wordList)) {
    return;
  }

  for (const { chinese, latin, number } of wordList) {
    let line = chinese;

    // An entry without a transcription (e.g. one parsed from a blank line) is
    // printed as the bare first column.
    if (latin != null) {
      line += `\t${latin}`;

      if (number != null) {
        line += `\t${number}`;
      }
    }

    print(line);
  }
}
|
||||
|
||||
|
||||
const { fileName } = validateInput();

// Accept both LF and CRLF line endings so that a stray "\r" never ends up in
// the last column of a line (where it would alter the grouping key).
const lines = readFileSync(fileName, 'utf8').trim().split(/\r?\n/);

// Parse every tab-separated line into an entry object: the first column is
// stored as `chinese`, the second as `latin` and the optional third one as an
// integer `number` (null when the column is absent or empty).
const entries = lines.map(line => {
  const [chinese, latin, frequency] = line.split('\t');

  return {
    original: line,
    chinese,
    latin,
    number: frequency ? parseInt(frequency, 10) : null
  };
});

// Group entries by the Latin character sequence. A Map is used instead of a
// plain object so that a sequence matching an inherited property name
// ("constructor", "toString", ...) cannot be mistaken for an existing group.
// It also keeps the groups in first-seen order.
const groups = new Map();
for (const entry of entries) {
  if (!groups.has(entry.latin)) {
    groups.set(entry.latin, []);
  }
  groups.get(entry.latin).push(entry);
}

// Process each group: sort by number (descending) and reassign ordinal numbers.
const sortedEntries = [];
for (const group of groups.values()) {
  // Separate entries with and without numbers.
  const withNumbers = group.filter(e => e.number !== null);
  const withoutNumbers = group.filter(e => e.number === null);

  // Highest number first; equal numbers keep their input order.
  withNumbers.sort((a, b) => b.number - a.number);

  // Replace the numbers with ordinal rankings: the first (highest) entry gets
  // the group size, the last one gets 1.
  withNumbers.forEach((entry, index) => {
    entry.number = withNumbers.length - index;
  });

  // Ranked entries first, then the entries without numbers in their original order.
  for (const entry of withNumbers) {
    sortedEntries.push(entry);
  }
  for (const entry of withoutNumbers) {
    sortedEntries.push(entry);
  }
}

printWords(sortedEntries);
|
||||
Loading…
Add table
Add a link
Reference in a new issue