Source code for subs2vec.clean_wiki

"""Clean Wikipedia dumps for use as a training corpus."""
import re
import argparse
import html
import bz2
import logging
from .utensils import log_timer
from multiprocessing import cpu_count
logging.basicConfig(format='[{levelname}] {message}', style='{', level=logging.INFO)
cores = int(cpu_count() / 2)


[docs]@log_timer
def strip_wiki_file(fname):
    """Strip xml and other tags from Wikipedia dump.

    Writes stripped Wikipedia text directly to text file.

    :param fname: Wikipedia dump file, in xml or bzip2 format.
    """
    logging.info(f'stripping {fname}')
    if fname.endswith('.bz2'):
        with bz2.open(fname, 'rt', encoding='utf-8') as in_file, open(fname.replace('.xml.bz2', '.txt'), 'w', encoding='utf-8') as out_file:
            out_file.write(_strip_wiki_xml(in_file.read()))
    if fname.endswith('.xml'):
        with open(fname, 'r', encoding='utf-8') as in_file, open(fname.replace('.xml', '.txt'), 'w', encoding='utf-8') as out_file:
            out_file.write(_strip_wiki_xml(in_file.read()))
    logging.info(f'completed stripping {fname}')


[docs]@log_timer
def big_strip_wiki_file(fname, lines_per_chunk=1e6):
    """Strip xml and other tags from a Wikipedia dump that doesn't fit into RAM.

    Processes Wikipedia dump in chunks and then concatenates the junks into a single text file.

    :param fname: Wikipedia dump file, in xml or bzip2 format.
    :param lines_per_chunk: number of lines in each chunk (default is 1e6, one million lines)
    """
    logging.info(f'stripping {fname}')
    if fname.endswith('.bz2'):
        with bz2.open(fname, 'rt', encoding='utf-8') as in_file, open(fname.replace('.xml.bz2', '.txt'), 'w', encoding='utf-8') as out_file:

            i = 0
            j = 0
            lines = []
            text = False
            for line in in_file:
                if '<text' in line:
                    lines.append(line)
                    text = True
                elif '</text' in line:
                    lines.append(line)
                    text = False
                    if i > ((j + 1) * int(lines_per_chunk)):
                        out_file.write(_strip_wiki_xml(''.join(lines)))
                        lines = []
                        j += 1
                else:
                    if text:
                        lines.append(line)
                        i += 1
            out_file.write(_strip_wiki_xml(''.join(lines)))

    if fname.endswith('.xml'):
        with open(fname, 'r', encoding='utf-8') as in_file, open(fname.replace('.xml', '.txt'), 'w', encoding='utf-8') as out_file:
            out_file.write(_strip_wiki_xml(in_file.read()))
    logging.info(f'completed stripping {fname}')


def _strip_curly(txt):
    curly = 0
    txt = list(txt)
    for i in range(len(txt)):
        if txt[i] == '{':
            curly += 1
        elif txt[i] == '}':
            curly -= 1
            txt[i] = ''
        if curly > 0:
            txt[i] = ''
        elif curly < 0:
            # there appear to be more closing than opening brackets in this lemma, we should just discard it
            txt = []
            break
    return ''.join(txt)


regeces = [
    (r'(?s)<ref.*?</ref>', ''),  # strip reference links
    (r'(?s)<references.*?</references>', ''),  # strip references
    (r'(?s)<table.*?</table>', ''),  # strip tables
    (r'(?s)<gallery.*?</gallery>', ''),  # strip galleries
    (r'(?s)<kml.*?</kml>', ''),  # strip KML tags
    (r'<.*?>', ''),  # strip other xml tags
    (r'http.*?(?:[\s\n\]]|$)', ''),  # strip external http(s) links
    (r'\[\[[^\]]*?:.*\|(.*?)\]\]', '\\1'),  # strip links to files, etc. but keep labels
    (r'\[\[[^\]]*?:(.*?)\]\]', ''),  # strip category links
    (r'\[\[[^\]]*?\|(.*?)\]\]', '\\1'),  # convert labeled links to just labels
    (r'(?m)^[\s]*[!?*;:=+\-|#_].*?$', ''),  # strip lines that do not start with alphanumerics, quotes, or brackets
    (r'(?m)^.*?\(UTC\).*?$', ''),  # strip lines containing a time stamp
    (r'\s\(.*?\)', ''),  # remove everything in parentheses
    (r'([^\s.!?:;]{2})[.!?:;]+?[\s\n]|$', '\\1\n'),  # break sentences at periods
    (r"[-–—/']", ' '),  # replace hyphens, apostrophes and slashes with spaces
    (r'\s*\n\s*', '\n'),  # strip empty lines and lines containing whitespace
    (r'\s{2,}', ' '),  # strip excessive spaces
]
patterns = [(re.compile(regec[0], re.IGNORECASE), regec[1]) for regec in regeces]
def _strip_wiki_xml(txts):
    """Strip xml and other tags from Wikipedia text.

    :param txts: Wikipedia dump text containing multiple articles
    :return: stripped Wikipedia text
    """
    pattern = re.compile('<text.*?>(.*?)</text>', re.DOTALL)
    txts = pattern.findall(html.unescape(html.unescape(txts)))  # double unescape because Wikipedia dumps are a mess

    txts = [_strip_curly(txt) if ((not txt.startswith('#'))
                                 and ('<noinclude>' not in txt.lower())
                                 and ('__noindex__' not in txt.lower())
                                 ) else '' for txt in txts]

    for i in range(len(txts)):
        for pattern in patterns:
            txts[i] = pattern[0].sub(pattern[1], txts[i])

    txts = [''.join([letter for letter in txt if (letter.isalnum() or letter.isspace())]) for txt in txts if txt != '']
    return '\n'.join(txts)


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description='strip Wikipedia dumps of xml and other tags')
    argparser.add_argument('fname', help='name of Wikipedia dump file')
    argparser.add_argument('--big', action='store_true', help='use special method for files that do not fit in RAM')
    args = argparser.parse_args()

    if args.big:
        big_strip_wiki_file(fname=args.fname)
    else:
        strip_wiki_file(fname=args.fname)