a script for injesting words by unicode range
This commit is contained in:
parent
00e00e1802
commit
e9b832c46c
1 changed files with 111 additions and 0 deletions
111
scripts/injest-script-words.js
Normal file
111
scripts/injest-script-words.js
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
const { basename } = require('path');
|
||||||
|
const { createReadStream, existsSync } = require('fs');
|
||||||
|
const { createInterface } = require('readline');
|
||||||
|
const { print, printError } = require('./_printers.js');
|
||||||
|
|
||||||
|
|
||||||
|
function printHelp() {
|
||||||
|
print(`Usage ${basename(process.argv[1])} word-list.txt UnicodeRange [ExcludeRange]`);
|
||||||
|
print('Extracts words that belong to a given unicode range. Assumes all punctuation, whitespace and foreign characters as word separators.');
|
||||||
|
print('Example UnicodeRange: U+900-U+97F');
|
||||||
|
print('Example ExcludeRange: U+950-U+954U+964-U+971U+200CU+200D');
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
function validateInput() {
|
||||||
|
if (process.argv.length < 3) {
|
||||||
|
printHelp();
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!existsSync(process.argv[2])) {
|
||||||
|
printError(`Failure! Could not find word list file "${process.argv[2]}".`);
|
||||||
|
process.exit(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!validateUnicodeRange(process.argv[3])) {
|
||||||
|
printError(`Failure! Invalid or missing search Unicode range(s): "${process.argv[3]}".`);
|
||||||
|
process.exit(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (process.argv[4] && !validateUnicodeRange(process.argv[4])) {
|
||||||
|
printError(`Failure! Invalid exclude range(s): "${process.argv[4]}".`);
|
||||||
|
process.exit(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
fileName: process.argv[2],
|
||||||
|
searchRegexString: process.argv[3],
|
||||||
|
excludeRegexString: typeof process.argv[4] === 'string' ? process.argv[4] : ''
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
function validateUnicodeRange(inputRange) {
|
||||||
|
return /^([uU]\+[\da-fA-F]+)(\-*[uU]\+[\da-fA-F]+)*$/.test(inputRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
function URangeToXRange(range) {
|
||||||
|
if (range.length === 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return range
|
||||||
|
.toUpperCase()
|
||||||
|
.replaceAll(/U\+([\dA-F]+)/g, "\\u{$1}");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
function printWords(wordList) {
|
||||||
|
if (Array.isArray(wordList)) {
|
||||||
|
wordList.forEach(w => print(w));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
function cleanInvalidChars(line, searchRegex, excludeRegex) {
|
||||||
|
const spacesOnly = /^\s+$/;
|
||||||
|
|
||||||
|
if (!line || !line.length || spacesOnly.test(line)) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const cleanLine = excludeRegex !== null ? line.replace(excludeRegex, ' ') : line;
|
||||||
|
return cleanLine
|
||||||
|
.replace(searchRegex, ' ')
|
||||||
|
.split(' ')
|
||||||
|
.filter(w => w.length > 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async function readWords(fileName, searchRegex, excludeRegex) {
|
||||||
|
const words = new Set();
|
||||||
|
|
||||||
|
if (!fileName) {
|
||||||
|
return words;
|
||||||
|
}
|
||||||
|
|
||||||
|
for await (const line of createInterface({ input: createReadStream(fileName) })) {
|
||||||
|
cleanInvalidChars(line, searchRegex, excludeRegex).forEach(w => words.add(w));
|
||||||
|
}
|
||||||
|
|
||||||
|
return words;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async function work({ fileName, searchRegexString, excludeRegexString }) {
|
||||||
|
const searchRegex = new RegExp("[^" + URangeToXRange(searchRegexString) + "]+", "gu");
|
||||||
|
const excludeRegex = excludeRegexString.length > 0 ? new RegExp("[" + URangeToXRange(excludeRegexString) + "]+", "gu") : null;
|
||||||
|
|
||||||
|
const words = Array.from(await readWords(fileName, searchRegex, excludeRegex));
|
||||||
|
return words.filter(word => word.length > 1).sort();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/** main **/
|
||||||
|
work(validateInput())
|
||||||
|
.then(words => printWords(words))
|
||||||
|
.catch(e => printError(e));
|
||||||
Loading…
Add table
Add a link
Reference in a new issue