Source code for subs2vec.download

"""Download datasets for training subs2vec models (not Windows-compatible)."""
import subprocess
import pandas as pd
import argparse
import os
path = os.path.dirname(__file__)


[docs]def download_corpus(lang, source):
    """Convenient method for downloading corpora.

    Uses the subprocess module and curl to download corpora.
    Not Windows-compatible.

    :param lang: language to download (use 2-letter ISO code)
    :param source: corpus to download, (options are "subs" or "wiki")
    """
    urls = {
        'subs': f'http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/raw/{lang}.zip',
        'wiki': f'http://dumps.wikimedia.your.org/{lang}wiki/latest/{lang}wiki-latest-pages-meta-current.xml.bz2'
    }
    subprocess.run(['curl', '-OLk', urls[source]])


[docs]def download_vecs(lang, source, binaries=False):
    """Convenient method for downloading vectors.

    Uses the subprocess module and curl to download corpora.
    Not Windows-compatible.

    :param lang: language to download (use 2-letter ISO code)
    :param source: corpus to download, (options are "subs", "wiki", or "wiki+subs")
    :param binaries: download binaries instead of vecs, boolean (set to False by default)
    """
    df = pd.read_csv(os.path.join(path, 'paper_results', 'corpus_data.tsv'), sep='\t')
    if binaries:
        filetype = 'binary'  # look up model binary url in df
    else:
        filetype = 'topvecs'  # look up url for top 1M vecs in df
    url = df.loc[(df['lang'] == lang) & (df['vecs'] == source), filetype].values[0]
    fname = f'{source.replace("+", "-")}.{lang}.1e6.zip'
    subprocess.run(['curl', '-Lo', fname, f'{url}@download'])


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description='download datasets for training subs2vec models (not Windows-compatible)')
    argparser.add_argument('lang', help='language to download (use 2-letter ISO code)')
    argparser.add_argument('source', choices=['subs', 'wiki', 'wiki+subs'], help='what source to download from')
    argparser.add_argument('--vecs', action='store_true', help='download vectors')
    argparser.add_argument('--binaries', action='store_true', help='download binaries')
    argparser.add_argument('--corpus', action='store_true', help='download corpus')
    args = argparser.parse_args()

    if args.vecs:
        download_vecs(args.lang, args.source)
    if args.binaries:
        download_vecs(args.lang, args.source, binaries=args.binaries)
    if args.corpus:
        download_corpus(args.lang, args.source)