Source code for subs2vec.download

"""Download datasets for training subs2vec models (not Windows-compatible)."""
import subprocess
import pandas as pd
import argparse
import os
path = os.path.dirname(__file__)


def download_corpus(lang, source):
    """Convenient method for downloading corpora.

    Uses the subprocess module and curl to download corpora into the current
    working directory. Not Windows-compatible.

    :param lang: language to download (use 2-letter ISO code)
    :param source: corpus to download, (options are "subs" or "wiki")
    :raises KeyError: if `source` is not one of the supported corpora
    """
    urls = {
        'subs': f'http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/raw/{lang}.zip',
        'wiki': f'http://dumps.wikimedia.your.org/{lang}wiki/latest/{lang}wiki-latest-pages-meta-current.xml.bz2',
    }
    if source not in urls:
        # Same exception type the bare dict lookup raised, but with a message
        # that tells the caller which values are accepted.
        raise KeyError(f'unknown corpus source {source!r}; options are "subs" or "wiki"')
    # curl flags: -O save under the remote file name, -L follow redirects,
    # -k skip TLS certificate verification.
    subprocess.run(['curl', '-OLk', urls[source]])
def download_vecs(lang, source, binaries=False):
    """Convenient method for downloading vectors.

    Looks up the download url in the packaged `paper_results/corpus_data.tsv`
    table, then uses the subprocess module and curl to download the file into
    the current working directory. Not Windows-compatible.

    :param lang: language to download (use 2-letter ISO code)
    :param source: corpus to download, (options are "subs", "wiki", or "wiki+subs")
    :param binaries: download binaries instead of vecs, boolean (set to False by default)
    :raises IndexError: if the table has no row for this `lang`/`source` combination
    """
    df = pd.read_csv(os.path.join(path, 'paper_results', 'corpus_data.tsv'), sep='\t')
    if binaries:
        filetype = 'binary'  # look up model binary url in df
    else:
        filetype = 'topvecs'  # look up url for top 1M vecs in df

    matches = df.loc[(df['lang'] == lang) & (df['vecs'] == source), filetype]
    if matches.empty:
        # Same exception type that indexing the empty result raised, but with
        # a message that names the combination that could not be found.
        raise IndexError(f'no {filetype} download listed for lang {lang!r} and source {source!r}')
    url = matches.values[0]

    # NOTE(review): the file name carries the "1e6" tag even when binaries=True,
    # so vecs and binaries for the same lang/source are written to the same
    # file; kept as-is to preserve existing output names -- confirm intent.
    fname = f'{source.replace("+", "-")}.{lang}.1e6.zip'
    # curl flags: -L follow redirects, -o write the download to fname.
    subprocess.run(['curl', '-Lo', fname, f'{url}@download'])
if __name__ == '__main__':
    # Command-line entry point: pick a language and a source, then select one
    # or more of --vecs / --binaries / --corpus to download.
    argparser = argparse.ArgumentParser(description='download datasets for training subs2vec models (not Windows-compatible)')
    argparser.add_argument('lang', help='language to download (use 2-letter ISO code)')
    argparser.add_argument('source', choices=['subs', 'wiki', 'wiki+subs'], help='what source to download from')
    argparser.add_argument('--vecs', action='store_true', help='download vectors')
    argparser.add_argument('--binaries', action='store_true', help='download binaries')
    argparser.add_argument('--corpus', action='store_true', help='download corpus')
    args = argparser.parse_args()

    # "wiki+subs" is an accepted source, but corpora are only downloadable for
    # "subs" and "wiki". Reject the combination up front with a usage error
    # instead of failing with a traceback after other downloads have run.
    if args.corpus and args.source not in ('subs', 'wiki'):
        argparser.error('--corpus is only available for sources "subs" and "wiki"')

    if args.vecs:
        download_vecs(args.lang, args.source)
    if args.binaries:
        download_vecs(args.lang, args.source, binaries=args.binaries)
    if args.corpus:
        download_corpus(args.lang, args.source)