From 2f448636e9e6f132cb5c9cfc92c6d4c1c59f5051 Mon Sep 17 00:00:00 2001
From: sspanak
Date: Tue, 19 Dec 2023 13:02:16 +0200
Subject: [PATCH] scripts for cleaning up and adding new lists of words

---
 scripts/add-new-words.sh |  50 +++++++++++++
 scripts/injest-words.js  | 148 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100755 scripts/add-new-words.sh
 create mode 100644 scripts/injest-words.js

diff --git a/scripts/add-new-words.sh b/scripts/add-new-words.sh
new file mode 100755
index 00000000..d86d5a4e
--- /dev/null
+++ b/scripts/add-new-words.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Cleans up a list of new words and merges it into an existing dictionary.
+# The resulting dictionary is printed to stdout.
+#
+# Usage: add-new-words.sh LOCALE base-dictionary-file.csv new-words-file.txt frequency-file.csv
+# Exit codes: 1 - wrong usage; 2 - missing input file; otherwise, the status of the processing chain.
+
+if [ $# -lt 4 ]; then
+	echo "Usage: $0 LOCALE base-dictionary-file.csv new-words-file.txt frequency-file.csv"
+	echo 'Cleans up and adds new words to a dictionary file.'
+	echo 'LOCALE could be any valid JS locale, for example: en, en-US, etc...'
+	exit 1
+fi
+
+if ! [[ -f $2 ]]; then
+	echo "base-dictionary-file: '$2' does not exist"
+	exit 2
+fi
+
+if ! [[ -f $3 ]]; then
+	echo "new-words-file: '$3' does not exist"
+	exit 2
+fi
+
+if ! [[ -f $4 ]]; then
+	echo "frequency-file: '$4' does not exist"
+	exit 2
+fi
+
+LOCALE=$1
+DICTIONARY_FILE=$2
+NEW_WORDS_FILE=$3
+FREQUENCY_FILE=$4
+
+# Each step feeds the next one through a temporary file; "&&" stops the chain at the first failure.
+# The arguments are quoted, so that file names containing spaces work.
+sed -E 's/[\t0-9]+//g' "$DICTIONARY_FILE" > /tmp/_TT9_base.txt \
+	&& node scripts/injest-words.js "$NEW_WORDS_FILE" > /tmp/_TT9_1.txt \
+	&& node scripts/remove-foreign-words.js "$LOCALE" /tmp/_TT9_1.txt "$LOCALE" /tmp/_TT9_base.txt > /tmp/_TT9_2.txt \
+	&& cp /tmp/_TT9_base.txt /tmp/_TT9_combined.txt \
+	&& cat /tmp/_TT9_2.txt >> /tmp/_TT9_combined.txt \
+	&& node scripts/remove-dictionary-repeating-words.js "$LOCALE" /tmp/_TT9_combined.txt > /tmp/_TT9_clean.txt \
+	&& node scripts/inject-dictionary-frequencies.js /tmp/_TT9_clean.txt "$FREQUENCY_FILE" "$LOCALE" > /tmp/_TT9_output.txt \
+	&& cat /tmp/_TT9_output.txt
+STATUS=$?
+
+# Always clean up, but report the result of the chain above instead of the result of "rm".
+rm -f /tmp/_TT9*
+exit $STATUS
diff --git a/scripts/injest-words.js b/scripts/injest-words.js
new file mode 100644
index 00000000..01c63e95
--- /dev/null
+++ b/scripts/injest-words.js
@@ -0,0 +1,148 @@
+const { basename } = require('path');
+const { createReadStream, existsSync } = require('fs');
+const { createInterface } = require('readline');
+
+
+/**
+ * Prints the usage instructions.
+ */
+function printHelp() {
+	console.log(`Usage ${basename(process.argv[1])} word-list.txt`);
+	console.log('Breaks dashed words into separate words, puts multiple words on a line on new lines and deletes repeating new lines.');
+}
+
+
+/**
+ * Validates the command line arguments. Exits with code 1 when the word list argument is missing,
+ * or with code 2 when the word list file does not exist.
+ *
+ * @returns {{fileName: string}} the path of the word list file
+ */
+function validateInput() {
+	if (process.argv.length < 3) {
+		printHelp();
+		process.exit(1);
+	}
+
+	const fileName = process.argv[2];
+	if (!existsSync(fileName)) {
+		// report the path that was actually checked; process.argv[3] is never validated or used
+		console.error(`Failure! Could not find word list file "${fileName}."`);
+		process.exit(2);
+	}
+
+	return { fileName };
+}
+
+
+/**
+ * Prints every word on a separate line. Anything other than an array is ignored.
+ *
+ * @param {string[]} wordList
+ */
+function printWords(wordList) {
+	if (Array.isArray(wordList)) {
+		wordList.forEach(w => console.log(w));
+	}
+}
+
+
+/**
+ * Splits a line of text into words. Control characters, whitespace, ampersands, commas, double
+ * quotes, colons, semicolons, asterisks and slashes are treated as separators, while square
+ * brackets, parentheses, dots and question marks are deleted. Tokens shorter than two characters
+ * and tokens containing digits are discarded.
+ *
+ * @param {string} line
+ * @returns {string[]} the words found on the line; empty for blank input
+ */
+function cleanSpecialChars(line) {
+	const spacesOnly = /^\s+$/;
+	const digits = /\d+/;
+
+	if (!line || !line.length || spacesOnly.test(line)) {
+		return [];
+	}
+
+	return line
+		.replaceAll(/[\x01-\x20]+/g, ' ')
+		.replaceAll(/[&\s,":;\*/]+/g, ' ')
+		.replaceAll(/\s+/g, ' ')
+		.replaceAll(/[\[\]\.\?\(\)]/g, '')
+		.split(' ')
+		.filter(w => w.length > 1 && !digits.test(w));
+}
+
+
+/**
+ * Breaks dashed words into their parts and removes the duplicates. The first time a part is seen
+ * in front of a dash, it is added as a plain word ("part"); every next time, it is added in its
+ * dashed form ("part-"). The last part of a word is always added as is.
+ *
+ * @param {string[]} inputWords
+ * @returns {string[]} the unique words; empty when the input is not an array
+ */
+function splitDashedWords(inputWords) {
+	if (!Array.isArray(inputWords)) {
+		return [];
+	}
+
+	// Sets, unlike plain objects, store any string safely, including "__proto__"
+	const roots = new Set();
+	const words = new Set();
+
+	for (const word of inputWords) {
+		if (!word.includes('-')) {
+			words.add(word);
+			continue;
+		}
+
+		const parts = word.split('-');
+		const lastPart = parts.pop();
+		for (const part of parts) {
+			const key = `${part}-`;
+			if (roots.has(key)) {
+				words.add(key);
+			} else {
+				roots.add(key);
+				words.add(part);
+			}
+		}
+
+		words.add(lastPart);
+	}
+
+	return [...words];
+}
+
+
+/**
+ * Reads the word list line by line and extracts the unique words from it.
+ *
+ * @param {{fileName: string}} options
+ * @returns {Promise<string[]>} the sorted words that are longer than one character
+ */
+async function work({ fileName }) {
+	// declared locally, so "words" does not leak as an implicit global
+	const words = [];
+
+	const lineReader = createInterface({ input: createReadStream(fileName) });
+	for await (const line of lineReader) {
+		// append in place, instead of copying the whole list for every line
+		for (const word of cleanSpecialChars(line)) {
+			words.push(word);
+		}
+	}
+
+	return splitDashedWords(words).filter(w => w.length > 1).sort();
+}
+
+
+/** main **/
+work(validateInput())
+	.then(words => printWords(words))
+	.catch(e => {
+		console.error(e);
+		// a non-zero exit code lets the calling shell script detect the failure
+		process.exitCode = 1;
+	});