# -*- coding: utf-8 -*-
import re
class RegexBuilder:
r"""Builds regex using arguments passed into a pattern template.
Builds a regex object for which the pattern is made from an argument
passed into a template. If more than one argument is passed (iterable),
each pattern is joined by "|" (regex alternation 'or') to create a
single pattern.
Args:
pattern_args (iterable): String element(s) to be each passed to
``pattern_func`` to create a regex pattern. Each element is
``re.escape``'d before being passed.
pattern_func (callable): A 'template' function that should take a
string and return a string. It should take an element of
``pattern_args`` and return a valid regex pattern group string.
flags: ``re`` flag(s) to compile with the regex.
Example:
To create a simple regex that matches on the characters "a", "b",
or "c", followed by a period::
>>> rb = RegexBuilder('abc', lambda x: "{}\.".format(x))
Looking at ``rb.regex`` we get the following compiled regex::
>>> print(rb.regex)
'a\.|b\.|c\.'
The above is fairly simple, but this class can help in writing more
complex repetitive regex, making them more readable and easier to
create by using existing data structures.
Example:
To match the character following the words "lorem", "ipsum", "meili"
or "koda"::
>>> words = ['lorem', 'ipsum', 'meili', 'koda']
>>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x))
Looking at ``rb.regex`` we get the following compiled regex::
>>> print(rb.regex)
'(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).'
"""
    def __init__(self, pattern_args, pattern_func, flags=0):
        self.pattern_args = pattern_args
        self.pattern_func = pattern_func
        self.flags = flags

        # Compile
        self.regex = self._compile()

    def _compile(self):
        alts = []
        for arg in self.pattern_args:
            # Escape the argument, then apply the pattern template to it
            arg = re.escape(arg)
            alt = self.pattern_func(arg)
            alts.append(alt)

        # Join all alternatives into a single "or" pattern and compile
        pattern = "|".join(alts)
        return re.compile(pattern, self.flags)

    def __repr__(self):  # pragma: no cover
        return str(self.regex)
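
# A minimal usage sketch (illustrative only, not part of the library API):
# because each element of ``pattern_args`` goes through ``re.escape`` before
# the template is applied, regex metacharacters in the arguments are matched
# literally. The argument values below are made up for the example.
#
#   >>> rb = RegexBuilder(['a.b', 'c*d'], lambda x: "(?:{})".format(x))
#   >>> rb.regex.pattern
#   '(?:a\\.b)|(?:c\\*d)'
#   >>> bool(rb.regex.search("xx c*d xx"))
#   True
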
class PreProcessorRegex:
r"""Regex-based substitution text pre-processor.
Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
:class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
replacement parameter.
Args:
search_args (iterable): String element(s) to be each passed to
``search_func`` to create a regex pattern. Each element is
``re.escape``'d before being passed.
search_func (callable): A 'template' function that should take a
string and return a string. It should take an element of
``search_args`` and return a valid regex search pattern string.
repl (string): The common replacement passed to the ``sub`` method for
each ``regex``. Can be a raw string (the case of a regex
backreference, for example)
flags: ``re`` flag(s) to compile with each `regex`.
Example:
Add "!" after the words "lorem" or "ipsum", while ignoring case::
>>> import re
>>> words = ['lorem', 'ipsum']
>>> pp = PreProcessorRegex(words,
... lambda x: "({})".format(x), r'\\1!',
... re.IGNORECASE)
In this case, the regex is a group and the replacement uses its
backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the
following list of search/replacement pairs::
>>> print(pp)
(re.compile('(lorem)', re.IGNORECASE), repl='\1!'),
(re.compile('(ipsum)', re.IGNORECASE), repl='\1!')
It can then be run on any string of text::
>>> pp.run("LOREM ipSuM")
"LOREM! ipSuM!"
See :mod:`gtts.tokenizer.pre_processors` for more examples.
"""
    def __init__(self, search_args, search_func, repl, flags=0):
        self.repl = repl

        # Create regex list
        self.regexes = []
        for arg in search_args:
            rb = RegexBuilder([arg], search_func, flags)
            self.regexes.append(rb.regex)
    def run(self, text):
        """Run each regex substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for regex in self.regexes:
            text = regex.sub(self.repl, text)
        return text

    def __repr__(self):  # pragma: no cover
        subs_strs = []
        for r in self.regexes:
            subs_strs.append("({}, repl='{}')".format(r, self.repl))
        return ", ".join(subs_strs)
class PreProcessorSub:
r"""Simple substitution text preprocessor.
Performs string-for-string substitution from list a find/replace pairs.
It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
simple substitution regex.
Args:
sub_pairs (list): A list of tuples of the style
``(<search str>, <replace str>)``
ignore_case (bool): Ignore case during search. Defaults to ``True``.
Example:
Replace all occurrences of "Mac" to "PC" and "Firefox" to "Chrome"::
>>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')]
>>> pp = PreProcessorSub(sub_pairs)
Looking at the ``pp``, we get the following list of
search (regex)/replacement pairs::
>>> print(pp)
(re.compile('Mac', re.IGNORECASE), repl='PC'),
(re.compile('Firefox', re.IGNORECASE), repl='Chrome')
It can then be run on any string of text::
>>> pp.run("I use firefox on my mac")
"I use Chrome on my PC"
See :mod:`gtts.tokenizer.pre_processors` for more examples.
"""
    def __init__(self, sub_pairs, ignore_case=True):
        def search_func(x):
            return u"{}".format(x)

        flags = re.I if ignore_case else 0

        # Create pre-processor list
        self.pre_processors = []
        for sub_pair in sub_pairs:
            pattern, repl = sub_pair
            pp = PreProcessorRegex([pattern], search_func, repl, flags)
            self.pre_processors.append(pp)
    def run(self, text):
        """Run each substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
            applied.

        """
        for pp in self.pre_processors:
            text = pp.run(text)
        return text

    def __repr__(self):  # pragma: no cover
        return ", ".join([str(pp) for pp in self.pre_processors])
class Tokenizer:
r"""An extensible but simple generic rule-based tokenizer.
A generic and simple string tokenizer that takes a list of functions
(called `tokenizer cases`) returning ``regex`` objects and joins them by
"|" (regex alternation 'or') to create a single regex to use with the
standard ``regex.split()`` function.
``regex_funcs`` is a list of any function that can return a ``regex``
(from ``re.compile()``) object, such as a
:class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex``
attribute).
See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples.
Args:
regex_funcs (list): List of compiled ``regex`` objects. Each
function's pattern will be joined into a single pattern and
compiled.
flags: ``re`` flag(s) to compile with the final regex. Defaults to
``re.IGNORECASE``
Note:
When the ``regex`` objects obtained from ``regex_funcs`` are joined,
their individual ``re`` flags are ignored in favour of ``flags``.
Raises:
TypeError: When an element of ``regex_funcs`` is not a function, or
a function that does not return a compiled ``regex`` object.
Warning:
Joined ``regex`` patterns can easily interfere with one another in
unexpected ways. It is recommended that each tokenizer case operate
on distinct or non-overlapping characters/sets of characters
(For example, a tokenizer case for the period (".") should also
handle not matching/cutting on decimals, instead of making that
a separate tokenizer case).
Example:
A tokenizer with a two simple case (*Note: these are bad cases to
tokenize on, this is simply a usage example*)::
>>> import re, RegexBuilder
>>>
>>> def case1():
... return re.compile("\,")
>>>
>>> def case2():
... return RegexBuilder('abc', lambda x: "{}\.".format(x)).regex
>>>
>>> t = Tokenizer([case1, case2])
Looking at ``case1().pattern``, we get::
>>> print(case1().pattern)
'\\,'
Looking at ``case2().pattern``, we get::
>>> print(case2().pattern)
'a\\.|b\\.|c\\.'
Finally, looking at ``t``, we get them combined::
>>> print(t)
're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE)
from: [<function case1 at 0x10bbcdd08>, <function case2 at 0x10b5c5e18>]'
It can then be run on any string of text::
>>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")
['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"]
"""
    def __init__(self, regex_funcs, flags=re.IGNORECASE):
        self.regex_funcs = regex_funcs
        self.flags = flags

        try:
            # Combine
            self.total_regex = self._combine_regex()
        except (TypeError, AttributeError) as e:  # pragma: no cover
            raise TypeError(
                "Tokenizer() expects a list of functions returning "
                "regular expression objects (i.e. re.compile). " + str(e)
            )

    def _combine_regex(self):
        alts = []
        for func in self.regex_funcs:
            # Call each case function to get its compiled regex
            alts.append(func())

        # Join the patterns of all case regexes into one alternation and
        # re-compile it with this Tokenizer's flags
        pattern = "|".join(alt.pattern for alt in alts)
        return re.compile(pattern, self.flags)
    def run(self, text):
        """Tokenize ``text``.

        Args:
            text (string): the input text to tokenize.

        Returns:
            list: A list of strings (tokens) split according to the
            tokenizer cases.

        """
        return self.total_regex.split(text)

    def __repr__(self):  # pragma: no cover
        return str(self.total_regex) + " from: " + str(self.regex_funcs)
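
# A minimal sketch (illustrative only, not part of the library API) showing
# that ``_combine_regex`` keeps only the patterns of the case regexes; each
# case's own flags are discarded in favour of the Tokenizer's ``flags``.
# The case function below is made up for the example.
#
#   >>> import re
#   >>> def colon_case():
#   ...     return re.compile(":", re.UNICODE)
#   >>> t = Tokenizer([colon_case], flags=re.IGNORECASE)
#   >>> t.total_regex
#   re.compile(':', re.IGNORECASE)
#   >>> t.run("one: two: three")
#   ['one', ' two', ' three']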