Source code for gtts.tokenizer.core

# -*- coding: utf-8 -*-
import re


class RegexBuilder():
    r"""Builds regex using arguments passed into a pattern template.

    Builds a regex object for which the pattern is made from an argument
    passed into a template. If more than one argument is passed (iterable),
    each pattern is joined by "|" (regex alternation 'or') to create a
    single pattern.

    Args:
        pattern_args (iterable): String element(s) to be each passed to
            ``pattern_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        pattern_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``pattern_args`` and return a valid regex pattern group string.
        flags: ``re`` flag(s) to compile with the regex.

    Example:
        To create a simple regex that matches on the characters "a", "b",
        or "c", followed by a period::

            >>> rb = RegexBuilder('abc', lambda x: "{}\.".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            'a\.|b\.|c\.'

        The above is fairly simple, but this class can help in writing more
        complex repetitive regex, making them more readable and easier to
        create by using existing data structures.

    Example:
        To match the character following the words "lorem", "ipsum", "meili"
        or "koda"::

            >>> words = ['lorem', 'ipsum', 'meili', 'koda']
            >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).'

    """

    def __init__(self, pattern_args, pattern_func, flags=0):
        self.pattern_args = pattern_args
        self.pattern_func = pattern_func
        self.flags = flags

        # Compile
        self.regex = self._compile()

    def _compile(self):
        alts = []
        for arg in self.pattern_args:
            arg = re.escape(arg)
            alt = self.pattern_func(arg)
            alts.append(alt)

        pattern = '|'.join(alts)
        return re.compile(pattern, self.flags)

    def __repr__(self):  # pragma: no cover
        return str(self.regex)
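
# Illustrative sketch (not part of the gTTS source): a minimal example of how
# RegexBuilder joins one pattern per argument with "|". The function name
# `_demo_regexbuilder` and the sample data are assumptions for this example only.
def _demo_regexbuilder():
    abbreviations = ['dr', 'mr', 'mrs']
    # One lookbehind pattern per abbreviation, joined by "|" and compiled once.
    rb = RegexBuilder(abbreviations,
                      lambda x: r"(?<={})\.".format(x),
                      re.IGNORECASE)
    # Strips the period that follows any of the abbreviations above.
    return rb.regex.sub('', "Dr. Smith met Mr. Jones.")  # -> 'Dr Smith met Mr Jones.'
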
class PreProcessorRegex():
    r"""Regex-based substitution text pre-processor.

    Runs a series of regex substitutions (``re.sub``) from each ``regex`` of
    a :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
    replacement parameter.

    Args:
        search_args (iterable): String element(s) to be each passed to
            ``search_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        search_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``search_args`` and return a valid regex search pattern string.
        repl (string): The common replacement passed to the ``sub`` method for
            each ``regex``. Can be a raw string (the case of a regex
            backreference, for example).
        flags: ``re`` flag(s) to compile with each ``regex``.

    Example:
        Add "!" after the words "lorem" or "ipsum", while ignoring case::

            >>> import re
            >>> words = ['lorem', 'ipsum']
            >>> pp = PreProcessorRegex(words,
            ...                        lambda x: "({})".format(x), r'\\1!',
            ...                        re.IGNORECASE)

        In this case, the regex is a group and the replacement uses its
        backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the
        following list of search/replacement pairs::

            >>> print(pp)
            (re.compile('(lorem)', re.IGNORECASE), repl='\1!'),
            (re.compile('(ipsum)', re.IGNORECASE), repl='\1!')

        It can then be run on any string of text::

            >>> pp.run("LOREM ipSuM")
            "LOREM! ipSuM!"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.

    """

    def __init__(self, search_args, search_func, repl, flags=0):
        self.repl = repl

        # Create regex list
        self.regexes = []
        for arg in search_args:
            rb = RegexBuilder([arg], search_func, flags)
            self.regexes.append(rb.regex)
    def run(self, text):
        """Run each regex substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for regex in self.regexes:
            text = regex.sub(self.repl, text)
        return text
    def __repr__(self):  # pragma: no cover
        subs_strs = []
        for r in self.regexes:
            subs_strs.append("({}, repl='{}')".format(r, self.repl))
        return ", ".join(subs_strs)
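
# Illustrative sketch (not part of the gTTS source): a PreProcessorRegex that
# replaces hyphens with spaces, similar in spirit to the pre-processors in
# gtts.tokenizer.pre_processors. The function name and sample text are
# assumptions for this example only.
def _demo_preprocessor_regex():
    pp = PreProcessorRegex(
        search_args='-',                       # one search element: the hyphen
        search_func=lambda x: "{}".format(x),  # used verbatim as the pattern
        repl=' ')
    return pp.run("my-name-is-Linda")  # -> 'my name is Linda'
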
class PreProcessorSub():
    r"""Simple substitution text pre-processor.

    Performs string-for-string substitution from a list of find/replace pairs.
    It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
    simple substitution regex.

    Args:
        sub_pairs (list): A list of tuples of the style
            ``(<search str>, <replace str>)``
        ignore_case (bool): Ignore case during search. Defaults to ``True``.

    Example:
        Replace all occurrences of "Mac" with "PC" and "Firefox" with
        "Chrome"::

            >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')]
            >>> pp = PreProcessorSub(sub_pairs)

        Looking at ``pp``, we get the following list of
        search (regex)/replacement pairs::

            >>> print(pp)
            (re.compile('Mac', re.IGNORECASE), repl='PC'),
            (re.compile('Firefox', re.IGNORECASE), repl='Chrome')

        It can then be run on any string of text::

            >>> pp.run("I use firefox on my mac")
            "I use Chrome on my PC"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.

    """

    def __init__(self, sub_pairs, ignore_case=True):
        def search_func(x):
            return u"{}".format(x)

        flags = re.I if ignore_case else 0

        # Create pre-processor list
        self.pre_processors = []
        for sub_pair in sub_pairs:
            pattern, repl = sub_pair
            pp = PreProcessorRegex([pattern], search_func, repl, flags)
            self.pre_processors.append(pp)
    def run(self, text):
        """Run each substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for pp in self.pre_processors:
            text = pp.run(text)
        return text
    def __repr__(self):  # pragma: no cover
        return ", ".join([str(pp) for pp in self.pre_processors])
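
# Illustrative sketch (not part of the gTTS source): a PreProcessorSub that
# expands a couple of abbreviations before tokenization. The function name and
# substitution pairs are assumptions for this example only.
def _demo_preprocessor_sub():
    sub_pairs = [('M.', 'Monsieur'), ('No.', 'Number')]
    pp = PreProcessorSub(sub_pairs)
    # Each pair is applied in order, case-insensitively by default.
    return pp.run("No. 1, M. Dupont")  # -> 'Number 1, Monsieur Dupont'
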
class Tokenizer():
    r"""An extensible but simple generic rule-based tokenizer.

    A generic and simple string tokenizer that takes a list of functions
    (called `tokenizer cases`) returning ``regex`` objects and joins them by
    "|" (regex alternation 'or') to create a single regex to use with the
    standard ``regex.split()`` function.

    ``regex_funcs`` is a list of any function that can return a ``regex``
    (from ``re.compile()``) object, such as a
    :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex``
    attribute).

    See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples.

    Args:
        regex_funcs (list): List of functions that return compiled ``regex``
            objects. Each function's pattern will be joined into a single
            pattern and compiled.
        flags: ``re`` flag(s) to compile with the final regex. Defaults to
            ``re.IGNORECASE``

    Note:
        When the ``regex`` objects obtained from ``regex_funcs`` are joined,
        their individual ``re`` flags are ignored in favour of ``flags``.

    Raises:
        TypeError: When an element of ``regex_funcs`` is not a function, or
            a function that does not return a compiled ``regex`` object.

    Warning:
        Joined ``regex`` patterns can easily interfere with one another in
        unexpected ways. It is recommended that each tokenizer case operate
        on distinct or non-overlapping characters/sets of characters
        (For example, a tokenizer case for the period (".") should also
        handle not matching/cutting on decimals, instead of making that a
        separate tokenizer case).

    Example:
        A tokenizer with two simple cases (*Note: these are bad cases to
        tokenize on, this is simply a usage example*)::

            >>> import re
            >>> from gtts.tokenizer.core import RegexBuilder, Tokenizer
            >>>
            >>> def case1():
            ...     return re.compile("\,")
            >>>
            >>> def case2():
            ...     return RegexBuilder('abc', lambda x: "{}\.".format(x)).regex
            >>>
            >>> t = Tokenizer([case1, case2])

        Looking at ``case1().pattern``, we get::

            >>> print(case1().pattern)
            '\\,'

        Looking at ``case2().pattern``, we get::

            >>> print(case2().pattern)
            'a\\.|b\\.|c\\.'

        Finally, looking at ``t``, we get them combined::

            >>> print(t)
            're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE)
             from: [<function case1 at 0x10bbcdd08>, <function case2 at 0x10b5c5e18>]'

        It can then be run on any string of text::

            >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")
            ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"]

    """

    def __init__(self, regex_funcs, flags=re.IGNORECASE):
        self.regex_funcs = regex_funcs
        self.flags = flags

        try:
            # Combine
            self.total_regex = self._combine_regex()
        except (TypeError, AttributeError) as e:  # pragma: no cover
            raise TypeError(
                "Tokenizer() expects a list of functions returning "
                "regular expression objects (i.e. re.compile). " + str(e))

    def _combine_regex(self):
        alts = []
        for func in self.regex_funcs:
            alts.append(func())

        pattern = '|'.join(alt.pattern for alt in alts)
        return re.compile(pattern, self.flags)
    def run(self, text):
        """Tokenize ``text``.

        Args:
            text (string): the input text to tokenize.

        Returns:
            list: A list of strings (tokens) split according to the tokenizer
            cases.

        """
        return self.total_regex.split(text)
    def __repr__(self):  # pragma: no cover
        return str(self.total_regex) + " from: " + str(self.regex_funcs)
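
# Illustrative sketch (not part of the gTTS source): a Tokenizer built from two
# tiny tokenizer cases, one splitting on colons and one on semicolons. The
# function names and sample text are assumptions for this example only; the
# real cases live in gtts.tokenizer.tokenizer_cases.
def _demo_tokenizer():
    def colon_case():
        return re.compile(r":")

    def semicolon_case():
        return RegexBuilder(';', lambda x: "{}".format(x)).regex

    # Both case patterns are joined by "|" into a single split regex.
    t = Tokenizer([colon_case, semicolon_case])
    return t.run("part one; part two: part three")
    # -> ['part one', ' part two', ' part three']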