new word processing scripts
* a script for injesting words by unicode range * Devanagari validation script
This commit is contained in:
parent
e9b832c46c
commit
622a954633
2 changed files with 245 additions and 0 deletions
149
scripts/extract-frequencies-from-text.js
Normal file
149
scripts/extract-frequencies-from-text.js
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
const { basename } = require('path');
|
||||
const { createReadStream, existsSync } = require('fs');
|
||||
const { createInterface } = require('readline');
|
||||
const { print, printError } = require('./_printers.js');
|
||||
|
||||
|
||||
function printHelp() {
|
||||
print(`Usage ${basename(process.argv[1])} text.txt UnicodeRange [ExcludeRange] [EraseRange]`);
|
||||
print('From the given text file, extracts and counts the unique words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
|
||||
print('Example UnicodeRange: U+900-U+97FU+200CU+200D');
|
||||
print('Example ExcludeRange: U+950-U+957U+966-U+97F');
|
||||
print('Example EraseRange: U+964U+965U+970U+971');
|
||||
}
|
||||
|
||||
|
||||
|
||||
function validateInput() {
|
||||
if (process.argv.length < 3) {
|
||||
printHelp();
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
if (!existsSync(process.argv[2])) {
|
||||
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
if (!validateUnicodeRange(process.argv[3])) {
|
||||
printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
|
||||
printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
if (process.argv[5] && !validateUnicodeRange(process.argv[5])) {
|
||||
printError(`Failure! Invalid exclude range(s): "${process.argv[5]}".`);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
return {
|
||||
fileName: process.argv[2],
|
||||
searchRegexString: process.argv[3],
|
||||
excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : '',
|
||||
eraseRegexString: typeof process.argv[5] === 'string' ? process.argv[5] : ''
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
function validateUnicodeRange(inputRange) {
|
||||
return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
|
||||
}
|
||||
|
||||
|
||||
function URangeToXRange(range) {
|
||||
if (range.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return range
|
||||
.toUpperCase()
|
||||
.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
|
||||
}
|
||||
|
||||
|
||||
function printWords(wordList) {
|
||||
if (Array.isArray(wordList)) {
|
||||
wordList.forEach(w => print(`${w.w}\t${w.f}`));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function cleanInvalidChars(line, eraseRegex, excludeRegexString) {
|
||||
const spacesOnly = /^\s+$/;
|
||||
|
||||
if (!line || !line.length || spacesOnly.test(line)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const invalidWordRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]", "u") : null;
|
||||
|
||||
return line
|
||||
.replace(eraseRegex, ' ')
|
||||
.split(' ')
|
||||
.filter(w => w.length > 1 && (invalidWordRegex === null || !invalidWordRegex.test(w)));
|
||||
}
|
||||
|
||||
|
||||
async function readWords(fileName, eraseRegex, excludeRegexString) {
|
||||
const words = new Map();
|
||||
|
||||
if (!fileName) {
|
||||
return words;
|
||||
}
|
||||
|
||||
for await (const line of createInterface({ input: createReadStream(fileName) })) {
|
||||
const parts = cleanInvalidChars(line, eraseRegex, excludeRegexString);
|
||||
parts.forEach(w => {
|
||||
words.set(w, words.has(w) ? words.get(w) + 1 : 1);
|
||||
});
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
|
||||
function sortWords(wordsMap) {
|
||||
const words = [];
|
||||
for (let [w, f] of wordsMap) {
|
||||
words.push({ w, f });
|
||||
}
|
||||
|
||||
return words.sort((a, b) => {
|
||||
if (a.f > b.f) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (a.f < b.f) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (a.w < b.w) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (a.w > b.w) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
async function work({ fileName, searchRegexString, excludeRegexString, eraseRegexString }) {
|
||||
const eraseRegex = new RegExp("([^" + URangeToXRange(searchRegexString) + " ]+|[" + URangeToXRange(eraseRegexString) + "])", "gu");
|
||||
return sortWords(
|
||||
await readWords(fileName, eraseRegex, excludeRegexString)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** main **/
|
||||
work(validateInput())
|
||||
.then(words => printWords(words))
|
||||
.catch(e => printError(e));
|
||||
Loading…
Add table
Add a link
Reference in a new issue