added support for preventing dashed words from being broken in the word ingestion scripts

2024-01-08 15:12:03 +02:00 · 2024-01-08 15:12:03 +02:00 · e58af0d45f
commit e58af0d45f
parent 4b7cef763a
2 changed files with 38 additions and 23 deletions
--- a/scripts/add-new-words.sh
+++ b/scripts/add-new-words.sh
@ -1,8 +1,8 @@
 #!/bin/bash

 if [ $# -lt 4 ]; then
-	echo "Usage: $0 LOCALE base-dictionary-file.csv new-words-file.txt frequency-file.csv"
-	echo 'Cleans up and adds new words to a dictionary file.'
+	echo "Usage: $0 LOCALE base-dictionary-file.csv new-words-file.txt frequency-file.csv [ignore-split-list.txt]"
+	echo 'Cleans up and adds new words to a dictionary file. Optionally, it could skip splitting the words from "ignore-split-list.txt"'
 	echo 'LOCALE could be any valid JS locale, for exmaple: en, en-US, etc...'
 	exit 1
 fi
@ -22,15 +22,17 @@ if ! [[ -f $4 ]]; then
 	exit 2
 fi

+
 LOCALE=$1
 DICTIONARY_FILE=$2
 NEW_WORDS_FILE=$3
 FREQUENCY_FILE=$4
+IGNORE_SPLIT_LIST_FILE=$5
 WORK_DIR="/tmp/TT9_$(uuidgen)"

 mkdir -p $WORK_DIR && \
 sed -E 's/[\t0-9]+//g' $DICTIONARY_FILE > $WORK_DIR/_TT9_base.txt \
-	&& node scripts/injest-words.js $NEW_WORDS_FILE > $WORK_DIR/_TT9_1.txt \
+	&& node scripts/injest-words.js $NEW_WORDS_FILE $IGNORE_SPLIT_LIST_FILE > $WORK_DIR/_TT9_1.txt \
 	&& node scripts/remove-foreign-words.js $LOCALE $WORK_DIR/_TT9_1.txt $LOCALE $WORK_DIR/_TT9_base.txt > $WORK_DIR/_TT9_2.txt \
 	&& cp $WORK_DIR/_TT9_base.txt $WORK_DIR/_TT9_combined.txt \
 	&& echo >> $WORK_DIR/_TT9_combined.txt \
--- a/scripts/injest-words.js
+++ b/scripts/injest-words.js
@ -4,8 +4,9 @@ const { createInterface } = require('readline');


 function printHelp() {
-	console.log(`Usage ${basename(process.argv[1])} word-list.txt`);
+	console.log(`Usage ${basename(process.argv[1])} word-list.txt [split-ignore-list.txt]`);
 	console.log('Breaks dashed words into separate words, puts multiple words on a line on new lines and deletes repeating new lines.');
+	console.log('The split-ignore-list is optional. Allows for not splitting certain words by dashes.');
 }


@ -21,9 +22,15 @@ function validateInput() {
 		process.exit(2);
 	}

-	return {
-		fileName: process.argv[2]
+	if (process.argv[3] && !existsSync(process.argv[3])) {
+		console.error(`Failure! Could not ignore list file "${process.argv[3]}."`);
+		process.exit(2);
 	}
+
+	return {
+		fileName: process.argv[2],
+		ignoreListFileName: process.argv[3]
+	};
 }


@ -49,15 +56,21 @@ function cleanSpecialChars(line) {
 }


-function splitDashedWords(inputWords) {
-	if (!Array.isArray(inputWords)) {
+function splitDashedWords(inputWords, ignoreList) {
+	const ignoreWords = ignoreList instanceof Set ? ignoreList : new Set();
+	if (!(inputWords instanceof Set)) {
 		return [];
 	}

 	const dashedRoots = new Set();
 	const repeatingDashedRoots = new Set();
+	const outputWords = new Set();

 	for (const word of inputWords) {
+		if (ignoreWords.has(word)) {
+			continue;
+		}
+
 		const [root, ...others] = word.split('-');
 		if (root === undefined || others.length != 1) {
 			continue;
@ -70,8 +83,6 @@ function splitDashedWords(inputWords) {
 		}
 	}

-	const outputWords = new Set();
-
 	for (const word of inputWords) {
 		const [root, ...others] = word.split('-');
 		if (root && others.length === 1 && repeatingDashedRoots.has(root)) {
@ -86,22 +97,24 @@ function splitDashedWords(inputWords) {
 }


+async function readWords(fileName) {
+	const words = new Set();

-async function work({ fileName }) {
-	const wordsSet = new Set();
-
-	const lineReader = createInterface({ input: createReadStream(fileName) });
-
-	for await (const line of lineReader) {
-		const newWords = cleanSpecialChars(line);
-
-		for (let i = 0; i < newWords.length; i++) {
-			wordsSet.add(newWords[i]);
-		}
+	if (!fileName) {
+		return words;
 	}

-	const wordsArray = Array.from(wordsSet);
-	const splitWords = splitDashedWords(wordsArray);
+	for await (const line of createInterface({ input: createReadStream(fileName) })) {
+		cleanSpecialChars(line).forEach(w => words.add(w));
+	}
+
+	return words;
+}
+
+
+async function work({ fileName, ignoreListFileName }) {
+	const [ words, ignoreList ] = await Promise.all([ readWords(fileName), readWords(ignoreListFileName) ]);
+	const splitWords = splitDashedWords(words, ignoreList);
 	const filteredAndSortedWords = splitWords.filter(word => word.length > 1).sort();

 	return filteredAndSortedWords;