Skip to content

liken.preprocessors

liken.preprocessors.strip()

Remove leading/trailing whitespace.

Source code in src/liken/preprocessors.py
def strip() -> Strip:
    """Build a preprocessor that trims leading and trailing whitespace.

    Returns:
        A new `Strip` preprocessor instance.
    """
    preprocessor = Strip()
    return preprocessor

liken.preprocessors.lower()

Convert strings to lowercase.

Source code in src/liken/preprocessors.py
def lower() -> Lower:
    """Build a preprocessor that converts strings to lowercase.

    Returns:
        A new `Lower` preprocessor instance.
    """
    preprocessor = Lower()
    return preprocessor

liken.preprocessors.alnum()

Remove non-alphanumeric characters, including spaces.

Source code in src/liken/preprocessors.py
def alnum() -> Alnum:
    """Build a preprocessor that drops every non-alphanumeric character.

    Spaces count as non-alphanumeric and are removed as well.

    Returns:
        A new `Alnum` preprocessor instance.
    """
    preprocessor = Alnum()
    return preprocessor

liken.preprocessors.remove_punctuation()

Remove punctuation but preserve spaces.

Source code in src/liken/preprocessors.py
def remove_punctuation() -> RemovePunctuation:
    """Build a preprocessor that removes punctuation while keeping spaces.

    Returns:
        A new `RemovePunctuation` preprocessor instance.
    """
    preprocessor = RemovePunctuation()
    return preprocessor

liken.preprocessors.normalize_unicode(form='NFKD')

Normalize Unicode strings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `form` | `typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']` | Unicode normalization form. Accepted values are "NFC", "NFKC", "NFD", "NFKD". | `'NFKD'` |

Source code in src/liken/preprocessors.py
def normalize_unicode(form: Literal["NFC", "NFKC", "NFD", "NFKD"] = "NFKD") -> NormalizeUnicode:
    """Build a preprocessor that normalizes Unicode strings.

    Args:
        form: Unicode normalization form to apply. One of "NFC", "NFKC",
            "NFD" or "NFKD"; defaults to "NFKD".

    Returns:
        A new `NormalizeUnicode` preprocessor configured with `form`.
    """
    preprocessor = NormalizeUnicode(form=form)
    return preprocessor

liken.preprocessors.ascii_fold()

Converts alphabetic, numeric, and symbolic characters that are not in the Basic Latin Unicode block (the first 128 code points, i.e. the ASCII range) to their ASCII equivalent, if one exists. For example, this preprocessor changes à to a.

Source code in src/liken/preprocessors.py
def ascii_fold() -> AsciiFold:
    """Fold non-ASCII characters to their ASCII equivalents.

    Converts alphabetic, numeric, and symbolic characters that are not in
    the Basic Latin Unicode block (U+0000-U+007F, the 128 ASCII code points)
    to their ASCII equivalent, if one exists. For example, this preprocessor
    changes à to a.

    Returns:
        A new `AsciiFold` preprocessor instance.
    """
    return AsciiFold()

liken.preprocessors.remove_stopwords(words=None, language='english')

Remove stopwords.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `words` | `list[str] \| None` | A list of words to ignore. If defined, the `language` argument is ignored. | `None` |
| `language` | `str` | The language to use for the stop words dictionary. | `'english'` |

Source code in src/liken/preprocessors.py
def remove_stopwords(
    words: list[str] | None = None,
    language: str = "english",
) -> RemoveStopwords:
    """Build a preprocessor that removes stopwords.

    Args:
        words: A list of words to ignore. If defined, the `language`
            argument is ignored.
        language: The language to use for the stop words dictionary.

    Returns:
        A new `RemoveStopwords` preprocessor configured with `words` and
        `language`.
    """
    preprocessor = RemoveStopwords(words=words, language=language)
    return preprocessor

liken.preprocessors.normalize_names()

Normalize personal names.

Preserves only first name, middle name and last name. Titles and nicknames are stripped. Commas are cleaned.

Source code in src/liken/preprocessors.py
def normalize_names() -> NormalizeName:
    """Build a preprocessor that normalizes personal names.

    Only the first, middle and last name are preserved. Titles and
    nicknames are stripped, and commas are cleaned.

    Returns:
        A new `NormalizeName` preprocessor instance.
    """
    preprocessor = NormalizeName()
    return preprocessor

liken.preprocessors.normalize_company()

Normalize company names.

Strips common company name nomenclature e.g. "Ltd.", or "LLC".

Source code in src/liken/preprocessors.py
def normalize_company() -> NormalizeCompany:
    """Build a preprocessor that normalizes company names.

    Common company name nomenclature, e.g. "Ltd." or "LLC", is stripped.

    Returns:
        A new `NormalizeCompany` preprocessor instance.
    """
    preprocessor = NormalizeCompany()
    return preprocessor