Source code for gtts.tokenizer.pre_processors

# -*- coding: utf-8 -*-
from gtts.tokenizer import PreProcessorRegex, PreProcessorSub, symbols
import re



[docs]
def tone_marks(text):
    """Insert a space following tone-modifying punctuation.

    The `tone_marks` tokenizer case splits right after a tone-modifying
    punctuation mark, so this pre-processor makes sure whitespace follows it.

    """

    def _after_mark(mark):
        # Zero-width lookbehind: targets the position right after the mark,
        # so the replacement is inserted there without consuming the mark.
        return u"(?<={})".format(mark)

    spacer = PreProcessorRegex(
        search_args=symbols.TONE_MARKS,
        search_func=_after_mark,
        repl=" ",
    )
    return spacer.run(text)




[docs]
def end_of_line(text):
    """Rejoin words that were split by an end-of-line hyphen.

    Every "<hyphen><newline>" sequence is replaced with nothing.

    """

    def _hyphen_then_newline(hyphen):
        # Pattern: the hyphen immediately followed by a line break.
        return u"{}\n".format(hyphen)

    joiner = PreProcessorRegex(
        search_args="-",
        search_func=_hyphen_then_newline,
        repl="",
    )
    return joiner.run(text)




[docs]
def abbreviations(text):
    """Strip the period that follows a known abbreviation.

    Applies to the abbreviations listed in ``symbols.ABBREVIATIONS``, which
    can be spoken the same without that period. Dropping it avoids having to
    handle tokenization of that period. Matching is case-insensitive.

    Note:
        Could potentially remove the ending period of a sentence.

    Note:
        Abbreviations that Google Translate can't pronounce without
        (or even with) a period should be added as a word substitution with a
        :class:`PreProcessorSub` pre-processor. Ex.: 'Esq.', 'Esquire'.

    """

    def _period_after(abbreviation):
        # Lookbehind requires the abbreviation right before the match; the
        # lookahead confirms the single consumed character is a literal period.
        return r"(?<={})(?=\.).".format(abbreviation)

    period_stripper = PreProcessorRegex(
        search_args=symbols.ABBREVIATIONS,
        search_func=_period_after,
        repl="",
        flags=re.IGNORECASE,
    )
    return period_stripper.run(text)




[docs]
def word_sub(text):
    """Apply the word-for-word substitutions from ``symbols.SUB_PAIRS``."""
    substituter = PreProcessorSub(sub_pairs=symbols.SUB_PAIRS)
    return substituter.run(text)