Source code for subs2vec.clean_wiki

"""Clean Wikipedia dumps for use as a training corpus."""
import re
import argparse
import html
import bz2
import logging
from .utensils import log_timer
from multiprocessing import cpu_count
# Configure the root logger: "[LEVEL] message" layout, INFO verbosity and above.
logging.basicConfig(level=logging.INFO, style='{', format='[{levelname}] {message}')
# Half of the CPU count reported by multiprocessing, rounded down.
cores = cpu_count() // 2


@log_timer
def strip_wiki_file(fname):
    """Strip xml and other tags from Wikipedia dump.

    Reads the whole dump into memory, strips it, and writes the stripped
    Wikipedia text directly to a text file next to the input. The output name
    is the input name with its ``.xml.bz2``, ``.bz2`` or ``.xml`` suffix
    replaced by ``.txt``.

    Files with any other extension are skipped with a warning.

    :param fname: Wikipedia dump file, in xml or bzip2 format.
    """
    logging.info(f'stripping {fname}')

    # Only the trailing suffix is rewritten. A plain str.replace() would also
    # touch '.xml' elsewhere in the path, and would leave a bare '.bz2' name
    # unchanged, so the output file would overwrite (truncate) the input.
    # '.xml.bz2' is listed first so the longest suffix wins.
    for suffix, opener in (('.xml.bz2', bz2.open), ('.bz2', bz2.open), ('.xml', open)):
        if fname.endswith(suffix):
            out_fname = fname[:-len(suffix)] + '.txt'
            break
    else:
        logging.warning(f'skipping {fname}: expected a .xml or .bz2 file')
        return

    with opener(fname, 'rt', encoding='utf-8') as in_file, \
            open(out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(_strip_wiki_xml(in_file.read()))
    logging.info(f'completed stripping {fname}')
@log_timer
def big_strip_wiki_file(fname, lines_per_chunk=1e6):
    """Strip xml and other tags from a Wikipedia dump that doesn't fit into RAM.

    Processes Wikipedia dump in chunks and then concatenates the chunks into a
    single text file. Only lines that belong to a ``<text>`` element are
    buffered; a chunk is flushed at the first article end after roughly
    ``lines_per_chunk`` input lines, so an article is never split across chunks.
    Both bzip2-compressed and plain xml dumps are streamed line by line.

    The output name is the input name with its ``.xml.bz2``, ``.bz2`` or
    ``.xml`` suffix replaced by ``.txt``. Files with any other extension are
    skipped with a warning.

    :param fname: Wikipedia dump file, in xml or bzip2 format.
    :param lines_per_chunk: number of lines in each chunk (default is 1e6, one million lines)
    """
    logging.info(f'stripping {fname}')

    # Only the trailing suffix is rewritten. A plain str.replace() would also
    # touch '.xml' elsewhere in the path, and would leave a bare '.bz2' name
    # unchanged, so the output file would overwrite (truncate) the input.
    for suffix, opener in (('.xml.bz2', bz2.open), ('.bz2', bz2.open), ('.xml', open)):
        if fname.endswith(suffix):
            out_fname = fname[:-len(suffix)] + '.txt'
            break
    else:
        logging.warning(f'skipping {fname}: expected a .xml or .bz2 file')
        return

    chunk_size = int(lines_per_chunk)  # the default is a float (1e6)
    with opener(fname, 'rt', encoding='utf-8') as in_file, \
            open(out_fname, 'w', encoding='utf-8') as out_file:
        lines = []
        in_text = False
        chunks_written = 0
        for i, line in enumerate(in_file):
            opens = '<text' in line
            closes = '</text' in line
            if opens and not closes and line.rstrip().endswith('/>'):
                # Self-closing <text ... /> (empty page): nothing to extract.
                # Skipping it keeps in_text from staying on across the page
                # metadata that follows.
                continue
            if opens or closes or in_text:
                lines.append(line)
            if closes:
                # Covers multi-line articles and one-line <text>...</text>
                # elements alike, so the flag is never left on by mistake.
                in_text = False
                if i > (chunks_written + 1) * chunk_size:
                    out_file.write(_strip_wiki_xml(''.join(lines)))
                    lines = []
                    chunks_written += 1
            elif opens:
                in_text = True
        # Flush whatever is left after the last full chunk.
        out_file.write(_strip_wiki_xml(''.join(lines)))
    logging.info(f'completed stripping {fname}')
def _strip_curly(txt):
    """Remove everything enclosed in curly brackets (templates), including nested ones.

    :param txt: text of a single article
    :return: text without curly-bracketed spans, or an empty string if the
        brackets are unbalanced (more closing than opening brackets)
    """
    depth = 0
    kept = []
    for char in txt:
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth < 0:
                # there appear to be more closing than opening brackets in this lemma, we should just discard it
                return ''
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)


# (pattern, replacement) pairs, applied in this order to every article.
regeces = [
    (r'(?s)<ref.*?</ref>', ''),  # strip reference links
    (r'(?s)<references.*?</references>', ''),  # strip references
    (r'(?s)<table.*?</table>', ''),  # strip tables
    (r'(?s)<gallery.*?</gallery>', ''),  # strip galleries
    (r'(?s)<kml.*?</kml>', ''),  # strip KML tags
    (r'<.*?>', ''),  # strip other xml tags
    (r'http.*?(?:[\s\n\]]|$)', ''),  # strip external http(s) links
    (r'\[\[[^\]]*?:.*\|(.*?)\]\]', '\\1'),  # strip links to files, etc. but keep labels
    (r'\[\[[^\]]*?:(.*?)\]\]', ''),  # strip category links
    (r'\[\[[^\]]*?\|(.*?)\]\]', '\\1'),  # convert labeled links to just labels
    (r'(?m)^[\s]*[!?*;:=+\-|#_].*?$', ''),  # strip lines that do not start with alphanumerics, quotes, or brackets
    (r'(?m)^.*?\(UTC\).*?$', ''),  # strip lines containing a time stamp
    (r'\s\(.*?\)', ''),  # remove everything in parentheses
    (r'([^\s.!?:;]{2})[.!?:;]+?[\s\n]|$', '\\1\n'),  # break sentences at periods
    (r"[-–—/']", ' '),  # replace hyphens, apostrophes and slashes with spaces
    (r'\s*\n\s*', '\n'),  # strip empty lines and lines containing whitespace
    (r'\s{2,}', ' '),  # strip excessive spaces
]
patterns = [(re.compile(regec[0], re.IGNORECASE), regec[1]) for regec in regeces]

# Matches one <text ...>...</text> element and captures its content. The
# lookbehind rejects self-closing <text ... /> tags (empty pages), which would
# otherwise swallow the page metadata up to the next </text>.
_TEXT_PATTERN = re.compile(r'<text\b[^>]*(?<!/)>(.*?)</text>', re.DOTALL)


def _strip_wiki_xml(txts):
    """Strip xml and other tags from Wikipedia text.

    Articles that start with ``#`` or contain ``<noinclude>`` or
    ``__noindex__`` are skipped, as are articles with no text left after
    cleaning. Each remaining article ends with a newline.

    :param txts: Wikipedia dump text containing multiple articles
    :return: stripped Wikipedia text
    """
    # double unescape because Wikipedia dumps are a mess
    articles = _TEXT_PATTERN.findall(html.unescape(html.unescape(txts)))

    cleaned = []
    for article in articles:
        lowered = article.lower()
        if article.startswith('#') or '<noinclude>' in lowered or '__noindex__' in lowered:
            continue
        article = _strip_curly(article)
        for pattern, replacement in patterns:
            article = pattern.sub(replacement, article)
        article = ''.join([char for char in article if char.isalnum() or char.isspace()])
        # The sentence-break pattern appends a newline even to an empty text,
        # so emptiness has to be tested on the stripped result.
        if article.strip():
            cleaned.append(article)
    return '\n'.join(cleaned)


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description='strip Wikipedia dumps of xml and other tags')
    argparser.add_argument('fname', help='name of Wikipedia dump file')
    argparser.add_argument('--big', action='store_true', help='use special method for files that do not fit in RAM')
    args = argparser.parse_args()

    if args.big:
        big_strip_wiki_file(fname=args.fname)
    else:
        strip_wiki_file(fname=args.fname)