"""Train a model using the default parameters from Van Paridon & Thompson (2019).
Use of this module requires working binaries for fastText and word2vec on the path and plenty of RAM.
Please note that even on a very fast desktop, training could take hours or days.
"""
import argparse
import subprocess as sp
from .utensils import log_timer
import psutil
import logging
logging.basicConfig(format='[{levelname}] {message}', style='{', level=logging.INFO)
cpu_count = psutil.cpu_count(logical=False) # logical=False to count only physical cores
[docs]@log_timer
def train_fasttext(training_data, prefix, lang, d=300, neg=10, epoch=10, t=.0001):
"""Train a fastText model on a given training corpus.
Requires a working fastText binary on the path.
:param training_data: text file containing the training corpus
:param prefix: prefix to use for the model binary and vector filenames
:param lang: language tag to use for the model binary and vector filenames
:param d: number of dimensions in the vector (default is 300)
:param neg: number of negative samples (fastText default is 5, subs2vec default used here is 10)
:param epoch: number of training epochs (fastText default is 5, subs2vec default used here is 10)
:param t: sampling threshold (default is .0001)
:return: tuple of filenames for model binary and vectors
"""
model_name = f'{prefix}.{lang}'
binary = ['fasttext']
method = ['skipgram']
train = ['-input', training_data]
output = ['-output', model_name]
neg = ['-neg', str(neg)]
epoch = ['-epoch', str(epoch)]
t = ['-t', str(t)]
d = ['-dim', str(d)]
thread = ['-thread', str(cpu_count)]
if logging.getLogger().isEnabledFor(logging.INFO):
sp.run(binary + method + train + output + neg + epoch + t + d + thread)
else:
sp.run(binary + method + train + output + neg + epoch + t + d + thread, stdout=sp.DEVNULL)
model = f'{model_name}.bin'
vecs = f'{model_name}.vec'
return model, vecs
[docs]@log_timer
def build_phrases(training_data, phrase_pass=5):
"""Use word2phrase to connect common phrases.
Uses the word2phrase tool to connect high mutual information phrases such as "New York" using underscores, so fastText will treat them as a single lexical item.
Requires a working word2phrase (included in word2vec) binary on the path.
:param training_data: text file containing the training corpus
:param phrase_pass: number of passes to do over the training file (default is 5)
:return: filename of the phrase-joined corpus
"""
base_fname = training_data.replace('.txt', '')
for i in range(phrase_pass):
t = (2 ** (phrase_pass - i - 1)) * 100
out_fname = f'{base_fname}.{i + 1}pass.d5.t{t}.txt'
binary = ['word2phrase']
train = ['-train', training_data]
output = ['-output', out_fname]
d = ['-min-count', str(5)]
t = ['-threshold', str(t)]
if logging.getLogger().isEnabledFor(logging.INFO):
sp.run(binary + train + output + d + t)
else:
sp.run(binary + train + output + d + t, stdout=sp.DEVNULL)
training_data = out_fname
return out_fname
[docs]def fix_encoding(training_data):
"""Fix utf-8 file encoding after word2phrase mangles it (happens sometimes).
:param training_data: file containing text with encoding that needs fixing
:return: filename of repaired text file
"""
out_fname = training_data.replace('.txt', '.utf-8.txt')
with open(training_data, 'r', encoding='utf-8', errors='ignore') as in_file, open(out_fname, 'w', encoding='utf-8') as out_file:
for line in in_file:
out_file.write(line)
return out_fname
[docs]def lowercase(training_data):
"""Cast a text file to lower case.
:param training_data: file containing text to cast to lower case
:return: filename of lower cased text file
"""
out_fname = training_data.replace('.txt', '.lower.txt')
with open(training_data, 'r', encoding='utf-8') as in_file, open(out_fname, 'w', encoding='utf-8') as out_file:
for line in in_file:
out_file.write(line.lower())
return out_fname
[docs]@log_timer
def generate(lang, prefix, training_data, lowercase=False):
"""Generate a fastText model using default parameters from Van Paridon & Thompson (2019).
:param lang: language tag to use for the model binary and vector filenames
:param training_data: text file containing the training corpus
:param prefix: prefix to use for the model binary and vector filenames
:param lowercase: boolean setting whether to cast training corpus to lower case (default is `False`)
"""
if lowercase:
logging.info(f'lowercasing {training_data}')
training_data = lowercase(training_data)
# build phrases
logging.info('building phrases for {}'.format(training_data))
training_data = build_phrases(training_data)
# fix potential broken utf-8 encoding
logging.info('checking (and fixing) utf-8 encoding for {}'.format(training_data))
training_data = fix_encoding(training_data)
# train fastText model
logging.info('training fastText model on {}'.format(training_data))
results = train_fasttext(training_data=training_data, lang=lang, prefix=prefix)
model, vecs = results
logging.info('model binary at {}'.format(model))
logging.info('word vectors at {}'.format(vecs))
if __name__ == '__main__':
argparser = argparse.ArgumentParser(description='generate a fastText model from OpenSubtitles and Wikipedia data')
argparser.add_argument('lang',
help='source language (OpenSubtitles and Wikipedia data uses ISO 639-1 codes)')
argparser.add_argument('prefix',
help='source data, use one of {wiki, sub, wiki-sub} if using automatic data preparation')
argparser.add_argument('training_data',
help='filename of text file containing the training corpus')
args = argparser.parse_args()
generate(**vars(args))