Skip to content

liken.datasets

liken.datasets.fake_10(backend='pandas', spark_session=None)

Synthetic 10 rows.

Parameters:

Name Type Description Default
backend liken.types.SupportedBackends

One of "pandas", "polars" or "spark".

'pandas'
spark_session pyspark.sql.SparkSession | None

The PySpark session to use when requesting data with the "spark" backend.

None

Returns:

Type Description
liken.types.UserDataFrame

A dataframe, in the defined backend.

Raises:

Type Description
ValueError

If no Spark session is passed when requesting a Spark dataframe.

Source code in src/liken/datasets.py
def fake_10(
    backend: SupportedBackends = "pandas",
    spark_session: SparkSession | None = None,
) -> UserDataFrame:
    """Return the fixed 10-row synthetic dataset.

    Args:
        backend: Which dataframe library to build the result with; one of
            "pandas", "polars" or "spark".
        spark_session: The pyspark session to use when ``backend`` is
            "spark".

    Returns:
        The dataset as a dataframe of the requested backend.

    Raises:
        ValueError: When a spark dataframe is requested but no spark session
            was passed.
    """
    # Schema and rows are module-level constants; the shared helper does the
    # backend-specific construction.
    frame = _return_df(
        schema=_SCHEMA10,
        data=_DATA10,
        backend=backend,
        spark_session=spark_session,
    )
    return frame

liken.datasets.fake_1K(backend='pandas', spark_session=None)

Synthetic 1K (one thousand) rows.

Parameters:

Name Type Description Default
backend liken.types.SupportedBackends

One of "pandas", "polars" or "spark".

'pandas'
spark_session pyspark.sql.SparkSession | None

The PySpark session to use when requesting data with the "spark" backend.

None

Returns:

Type Description
liken.types.UserDataFrame

A dataframe, in the defined backend.

Raises:

Type Description
ValueError

If no Spark session is passed when requesting a Spark dataframe.

Source code in src/liken/datasets.py
def fake_1K(
    backend: SupportedBackends = "pandas",
    spark_session: SparkSession | None = None,
) -> UserDataFrame:
    """Return a synthetic dataset of 1K (one thousand) rows.

    Args:
        backend: Which dataframe library to build the result with; one of
            "pandas", "polars" or "spark".
        spark_session: The pyspark session to use when ``backend`` is
            "spark".

    Returns:
        The dataset as a dataframe of the requested backend.

    Raises:
        ValueError: When a spark dataframe is requested but no spark session
            was passed.
    """
    generated_count = 999
    rows = [fake_row() for _ in range(generated_count)]
    # Repeat the final generated row so the data ends with an easy-to-spot
    # duplicate pair, bringing the total to 1000 rows.
    rows.extend(rows[-1:])

    frame = _return_df(
        schema=_SCHEMA10_PLUS,
        data=rows,
        backend=backend,
        spark_session=spark_session,
    )
    return frame

liken.datasets.fake_100K(backend='pandas', spark_session=None)

Synthetic 100K (one hundred thousand) rows.

Parameters:

Name Type Description Default
backend liken.types.SupportedBackends

One of "pandas", "polars" or "spark".

'pandas'
spark_session pyspark.sql.SparkSession | None

The PySpark session to use when requesting data with the "spark" backend.

None

Returns:

Type Description
liken.types.UserDataFrame

A dataframe, in the defined backend.

Raises:

Type Description
ValueError

If no Spark session is passed when requesting a Spark dataframe.

Source code in src/liken/datasets.py
def fake_100K(
    backend: SupportedBackends = "pandas",
    spark_session: SparkSession | None = None,
) -> UserDataFrame:
    """Return a synthetic dataset of 100K (one hundred thousand) rows.

    Args:
        backend: Which dataframe library to build the result with; one of
            "pandas", "polars" or "spark".
        spark_session: The pyspark session to use when ``backend`` is
            "spark".

    Returns:
        The dataset as a dataframe of the requested backend.

    Raises:
        ValueError: When a spark dataframe is requested but no spark session
            was passed.
    """
    generated_count = 99_999
    rows = [fake_row() for _ in range(generated_count)]
    # Repeat the final generated row so the data ends with an easy-to-spot
    # duplicate pair, bringing the total to 100,000 rows.
    rows.extend(rows[-1:])

    frame = _return_df(
        schema=_SCHEMA10_PLUS,
        data=rows,
        backend=backend,
        spark_session=spark_session,
    )
    return frame

liken.datasets.fake_1M(backend='pandas', spark_session=None)

Synthetic 1M (one million) rows.

Parameters:

Name Type Description Default
backend liken.types.SupportedBackends

One of "pandas", "polars" or "spark".

'pandas'
spark_session pyspark.sql.SparkSession | None

The PySpark session to use when requesting data with the "spark" backend.

None

Returns:

Type Description
liken.types.UserDataFrame

A dataframe, in the defined backend.

Raises:

Type Description
ValueError

If no Spark session is passed when requesting a Spark dataframe.

Source code in src/liken/datasets.py
def fake_1M(
    backend: SupportedBackends = "pandas",
    spark_session: SparkSession | None = None,
) -> UserDataFrame:
    """Return a synthetic dataset of 1M (one million) rows.

    Args:
        backend: Which dataframe library to build the result with; one of
            "pandas", "polars" or "spark".
        spark_session: The pyspark session to use when ``backend`` is
            "spark".

    Returns:
        The dataset as a dataframe of the requested backend.

    Raises:
        ValueError: When a spark dataframe is requested but no spark session
            was passed.
    """
    generated_count = 999_999
    rows = [fake_row() for _ in range(generated_count)]
    # Repeat the final generated row so the data ends with an easy-to-spot
    # duplicate pair, bringing the total to 1,000,000 rows.
    rows.extend(rows[-1:])

    frame = _return_df(
        schema=_SCHEMA10_PLUS,
        data=rows,
        backend=backend,
        spark_session=spark_session,
    )
    return frame