added support for frequencies in dictionaries with transcriptions
This commit is contained in:
parent
795145fb2b
commit
51cd39fe27
3 changed files with 100 additions and 6 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
class Wrapper {
|
class Wrapper {
|
||||||
static def getDictionaryLineData(String line, String delimiter) {
|
static def getDictionaryLineData(String line, String delimiter) {
|
||||||
String[] parts = line.split(delimiter, 2)
|
String[] parts = line.split(delimiter, 3)
|
||||||
String word = parts[0]
|
String word = parts[0]
|
||||||
String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""
|
String transcription = parts.length > 1 && parts[1] =~ "^[a-zA-Z]+\$" ? parts[1] : ""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ const DELIMITER = ' ';
|
||||||
|
|
||||||
|
|
||||||
/**
 * Prints the command-line usage of the frequency injection script.
 *
 * The --transcribed flag is shown in brackets because it is optional: the argument parser
 * only checks whether the fifth argument equals '--transcribed'.
 */
function printHelp() {
  const scriptName = basename(process.argv[1]);

  print(`Usage ${scriptName} LOCALE DICTIONARY-FILE-NAME.txt WORDS-WITH-FREQUENCIES.txt [--transcribed]`);
  print('Matches up the words from DICTIONARY-FILE-NAME with the frequencies in WORDS-WITH-FREQUENCIES file.');
  print('LOCALE could be any valid JS locale, for example: en, en-US, etc...');
  // In transcribed mode only the first field of each dictionary line is used as the lookup word.
  print('Pass --transcribed when each dictionary line holds a word followed by its transcription.');
}
|
||||||
|
|
@ -34,12 +34,13 @@ function validateInput() {
|
||||||
return {
|
return {
|
||||||
locale: process.argv[2],
|
locale: process.argv[2],
|
||||||
dictionaryFileName: process.argv[3],
|
dictionaryFileName: process.argv[3],
|
||||||
|
transcribed: process.argv[5] !== undefined && process.argv[5] === '--transcribed',
|
||||||
wordsWithFrequenciesFileName: process.argv[4]
|
wordsWithFrequenciesFileName: process.argv[4]
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale }) {
|
async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale, transcribed }) {
|
||||||
// read the frequencies
|
// read the frequencies
|
||||||
let lineReader = require('readline').createInterface({
|
let lineReader = require('readline').createInterface({
|
||||||
input: createReadStream(wordsWithFrequenciesFileName)
|
input: createReadStream(wordsWithFrequenciesFileName)
|
||||||
|
|
@ -69,9 +70,17 @@ async function inject({ wordsWithFrequenciesFileName, dictionaryFileName, locale
|
||||||
|
|
||||||
|
|
||||||
const outputWords = [];
|
const outputWords = [];
|
||||||
for await (const word of lineReader) {
|
for await (const line of lineReader) {
|
||||||
const lowercaseWord = word.toLocaleLowerCase(locale);
|
let word = '';
|
||||||
outputWords.push(`${word}${ (frequencies.get(lowercaseWord) || 0) > 0 ? DELIMITER + frequencies.get(lowercaseWord) : '' }`);
|
|
||||||
|
if (transcribed) {
|
||||||
|
const parts = line.split(DELIMITER);
|
||||||
|
word = parts[0];
|
||||||
|
} else {
|
||||||
|
word = line.toLocaleLowerCase(locale);
|
||||||
|
}
|
||||||
|
|
||||||
|
outputWords.push(`${line}${ (frequencies.get(word) || 0) > 0 ? DELIMITER + frequencies.get(word) : '' }`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return outputWords;
|
return outputWords;
|
||||||
|
|
|
||||||
85
scripts/normalize-transcribed.js
Normal file
85
scripts/normalize-transcribed.js
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
const { basename } = require('path');
|
||||||
|
const { existsSync, readFileSync } = require('fs');;
|
||||||
|
const { print, printError } = require('./_printers.js')
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Prints the usage line and a one-sentence description of the normalization script.
 */
function printHelp() {
  const scriptName = basename(process.argv[1]);
  const helpLines = [
    `Usage ${scriptName} WORD-LIST.txt`,
    'Normalizes the frequencies in a dictionary with transcriptions.'
  ];

  for (const helpLine of helpLines) {
    print(helpLine);
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Validates the command-line arguments of the normalization script.
 *
 * Exits with code 1 (after printing the help) when no word list argument was given, and
 * with code 2 when the given path does not exist.
 *
 * @returns {{ fileName: string }} the path of the word list to process
 */
function validateInput() {
  const hasFileArgument = process.argv.length >= 3;
  if (!hasFileArgument) {
    printHelp();
    process.exit(1);
  }

  const fileName = process.argv[2];
  if (!existsSync(fileName)) {
    printError(`Failure! Could not find word list file "${fileName}".`);
    process.exit(2);
  }

  return { fileName };
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Prints every entry of the word list as one tab-separated line: the word, its
 * transcription and, when present, its frequency.
 *
 * Missing fields are left out instead of being interpolated, so an entry without a
 * transcription no longer prints the literal text "undefined".
 *
 * @param {Array<{chinese: string, latin?: string, number?: (string|number|null)}>} wordList
 *   the entries to print; anything that is not an array is silently ignored
 */
function printWords(wordList) {
  if (!Array.isArray(wordList)) {
    return;
  }

  for (const { chinese, latin, number } of wordList) {
    // `!= null` drops both null and undefined, but keeps a legitimate 0 frequency.
    const fields = [chinese, latin, number].filter(field => field != null);
    print(fields.join('\t'));
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Main script: reads a tab-separated word list (word, transcription, optional frequency),
// groups the entries by transcription and replaces the frequencies within each group by
// ordinal ranks, where the most frequent entry gets the highest rank.
const { fileName } = validateInput();

const data = readFileSync(fileName, 'utf8');

// Accept both Unix and Windows line endings, so a stray "\r" never becomes part of the last
// field, and skip blank lines so that an empty file or empty line yields no bogus entry.
const lines = data.trim().split(/\r?\n/).filter(line => line.trim() !== '');

// Parse the data into an array of objects
const entries = lines.map(line => {
  const parts = line.split('\t');
  const frequency = Number.parseInt(parts[2], 10);

  return {
    original: line,
    chinese: parts[0],
    latin: parts[1],
    // A missing or non-numeric third column means "no frequency". Keeping NaN here would
    // make the sort comparator below return NaN and leave the order unspecified.
    number: Number.isNaN(frequency) ? null : frequency
  };
});

// Group entries by the Latin character sequence.
// A Map is used instead of a plain object, because a transcription such as "constructor"
// or "toString" would otherwise resolve to an inherited Object.prototype member and crash
// on push(). A Map also keeps the groups strictly in order of first appearance.
const groups = new Map();
for (const entry of entries) {
  const group = groups.get(entry.latin);
  if (group === undefined) {
    groups.set(entry.latin, [entry]);
  } else {
    group.push(entry);
  }
}

// Process each group: sort by number (descending) and reassign ordinal numbers
const sortedEntries = [];
for (const group of groups.values()) {
  // Separate entries with and without numbers
  const withNumbers = group.filter(e => e.number !== null);
  const withoutNumbers = group.filter(e => e.number === null);

  // Sort by number in descending order
  withNumbers.sort((a, b) => b.number - a.number);

  // Assign ordinal rankings: the first (most frequent) entry gets the group size, the last gets 1
  withNumbers.forEach((entry, index) => {
    entry.number = (withNumbers.length - index).toString();
  });

  // Ranked entries come first; entries without numbers keep their original relative order
  sortedEntries.push(...withNumbers, ...withoutNumbers);
}

printWords(sortedEntries);
|
||||||
Loading…
Add table
Add a link
Reference in a new issue