'''This module contains helper functions for snapshot (or gold master) testing
FactoryTX transforms. A snapshot test takes CSV data and processes
it with the specified transform, comparing the output with that of a
previous run.
Example:
.. code-block:: Python
from factorytx.snapshot_test_utils import compare_with_snapshot
from factorytx.test_utils import (
csv_string_to_dataframe,
load_transform,
)
CONFIG = {
'transform_type': 'convert_timestamps',
'transform_name': 'Time Changing Transformer',
'filter_stream': ['*'],
'field_names': ["TS2", "TS4"],
'timezone': "Asia/Taipei"
}
INPUT_DATA = """\
TS1,TS2,col_3,TS4,description
2018-02-16 18:02:01,2018-02-16 18:02:01,1.1,2018-02-16 18:02:01,date 1
2018-05-27 01:12:01,2018-05-27 01:12:01,2.2,2018-05-27 01:12:01,date 2
2018-05-29 12:02:12,2018-05-29 12:02:12,3.3,2018-05-29 12:02:12,date 3
2018-10-29 00:00:29,2018-10-29 00:00:29,4.4,2018-10-29 00:00:29,date 4
2018-12-31 19:19:19,2018-12-31 19:19:19,5.5,2018-12-31 19:19:19,date 5
2019-07-04 15:07:04,2019-07-04 15:07:04,6.6,2019-07-04 15:07:04,date 6
"""
def test_snapshot_rename_transform(capsys, snapshot):
input_df = csv_string_to_dataframe(INPUT_DATA)
transform = load_transform(CONFIG)
transformed_df = transform.process(input_df)
with capsys.disabled():
compare_with_snapshot(transformed_df, snapshot, [])
Anatomy of a snapshot test:
* `CONFIG`: a dictionary-type variable that has key-value pairs necessary for
configuring the transform. The configuration is based on the transform's
schema.
* `INPUT_DATA`: CSV data that will be processed by the transform. The data can
come in the form of a multiline Python string or a CSV file saved in a
directory. It is suggested to only save data as a CSV file if the transform
needs to process a large amount of data. For an example of a snapshot test
using a CSV file, please refer to the snapshot test for the `Rename transform <https://github.com/sightmachine/factorytx-core/tree/master/tests/factorytx/transforms/test_snapshot_rename.py>`_.
* The test function should pass in `capsys` and `snapshot` as parameters.
* `snapshot` for reading and writing snapshot data
* `capsys` needs to be disabled so differences between the current output
and snapshot are formatted correctly in the Terminal.
* :func:`~factorytx.test_utils.csv_string_to_dataframe`: Converts the CSV
string buffer into a Pandas DataFrame for the transform to process.
* :func:`~factorytx.test_utils.load_transform`: Loads the FactoryTX
transform based on the provided configuration.
* :func:`~factorytx.snapshot_test_utils.compare_with_snapshot`: Compares the
output from the transform processing the data with the saved snapshot.
To run snapshot tests, please use Pytest:
.. code-block:: bash
pytest tests/factorytx/transforms/test_snapshot_rename.py
# To update a snapshot, add `--snapshot-update` to the command
pytest tests/factorytx/transforms/test_snapshot_rename.py --snapshot-update
**NOTE:** When a test is run for the first time, the snapshot will be
automatically created. Snapshots will be saved in the
`tests/.../transforms/snapshots` directory.
'''
import csv
from datetime import datetime
import numbers
import os
import re
import sys
from typing import Any, List
import daff
import numpy as np
import pandas as pd
import pytz
from snapshottest import module
# This file is shared with the FactoryTX integration tests, so it MUST NOT
# import any modules from factorytx. Please put factorytx-specific test code
# in test_utils.py instead.
# A table is a list of rows, each row being a list of cell values. The first
# row holds the column names (see `dataframe_to_table` and `encode_table`).
Table = List[List[Any]]
def encode_value(value: Any) -> str:
    """Encode a single cell value as a string for snapshot comparison.

    * Numbers (including ``bool`` and float ``nan``) are encoded with ``repr``.
    * Strings are wrapped in single quotes.
    * Lists and dicts are encoded recursively.
    * Null values (as reported by ``pd.isnull``) become ``'None'``.
    * Datetimes are rendered as ``YYYY-MM-DD HH:MM:SS.ffffff``; timezone-aware
      values are converted to UTC first.

    :param value: the cell value to encode
    :returns: the encoded string
    :raises TypeError: if the type of ``value`` is not supported
    """
    if isinstance(value, numbers.Number):
        return repr(value)

    if isinstance(value, str):
        # Encode strings inside single quotes so that they are represented
        # differently from other values. Otherwise the diff will omit changes
        # from strings to other types or vice versa.
        return f"'{value}'"

    if isinstance(value, list):
        # Every element, including the last one, is followed by ', '. The
        # layout is kept as-is because the encoded text is what gets compared
        # against previously stored output.
        parts = ['[']
        for element in value:
            parts.append(encode_value(element))
            parts.append(', ')
        parts.append(']')
        return ''.join(parts)

    if isinstance(value, dict):
        parts = ['{']
        for key, element in value.items():
            parts.append(f'{key}: {encode_value(element)}')
            parts.append(', ')
        parts.append('}')
        return ''.join(parts)

    # Containers are handled above because pd.isnull returns an array for
    # them. The null check also runs before the datetime branch so that null
    # timestamps such as NaT are encoded as 'None'.
    if pd.isnull(value):
        return 'None'

    if isinstance(value, (datetime, np.datetime64, pd.Timestamp)):
        # As far as FactoryTX is concerned all datetime types are equivalent.
        if isinstance(value, np.datetime64):
            # Cast to microsecond precision first: converting a nanosecond
            # datetime64 straight to `datetime` yields an int, and a day-unit
            # datetime64 yields a `date`; neither has `tzinfo` or a time part.
            value = value.astype('datetime64[us]').astype(datetime)
        elif isinstance(value, pd.Timestamp):
            value = value.to_pydatetime()
        if value.tzinfo is not None:
            value = value.astimezone(pytz.utc)
        return value.strftime("%Y-%m-%d %H:%M:%S.%f")

    raise TypeError(f'Unsupported type "{type(value).__name__}": {value}')
def encode_table(table: Table) -> Table:
    """Return a copy of ``table`` with every data cell passed through
    `encode_value`.

    The first row is the header and is carried over without encoding.
    """
    result = [table[0]]
    for data_row in table[1:]:
        result.append([encode_value(cell) for cell in data_row])
    return result
# Matches any character that forces a value to be quoted: a double quote,
# a comma or a newline.
REQUIRES_ESCAPE_RE = re.compile('[",\n]')


def escape_value(value: str) -> str:
    """Quote ``value`` CSV-style when it contains a double quote, comma or
    newline.

    Values without any of those characters are returned unchanged. Otherwise
    embedded double quotes are doubled and the result is wrapped in double
    quotes.
    """
    if REQUIRES_ESCAPE_RE.search(value) is None:
        return value
    doubled = value.replace('"', '""')
    return '"' + doubled + '"'
def escape_table(table: Table) -> Table:
    """Return a new table in which every cell, header row included, has been
    passed through `escape_value`.
    """
    return [list(map(escape_value, row)) for row in table]
def load_table(s: str) -> Table:
    """Convert an aligned table from `format_table` back to an encoded table.

    NUL characters are removed, the text is parsed as comma-delimited CSV
    with double-quote quoting, trailing whitespace is stripped from every
    cell and empty rows are dropped.

    :param s: the stored table text
    :returns: the table as a list of rows of strings
    """
    # Split on '\n' only, then restore the terminator on every line: the csv
    # reader keeps a newline inside a quoted field only when it is part of
    # the line it is given, so feeding it bare lines would silently join
    # multi-line cells.
    csv_content = [row.replace('\0', '') + '\n' for row in s.split('\n')]
    reader = csv.reader(
        csv_content,
        delimiter=',',
        doublequote=True,
        quotechar='"',
        strict=False
    )
    # All strings are wrapped in single quotes so we can safely strip values.
    return [[cell.rstrip() for cell in row] for row in reader if row]
# Matches a control sequence that starts with either the single character
# 0x9B or ESC followed by '[', then any run of characters in the range
# '0'-'?', any run in the range ' '-'/', and one final character in '@'-'~'.
ANSI_ESCAPE_RE = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]')


def strip_ansi_escapes(s: str) -> str:
    """Return ``s`` with every sequence matched by `ANSI_ESCAPE_RE` removed."""
    cleaned = re.sub(ANSI_ESCAPE_RE, '', s)
    return cleaned
def format_diff(a: Table, b: Table, index: List[str] = []) -> str:
    """Render the differences between two tables as text using daff.

    :param a: first table to compare
    :param b: second table to compare
    :param index: column names registered with daff as primary keys. The
        default list is only iterated, never modified.
    :returns: the rendered diff, or an empty string when the diff holds at
        most one row
    """
    flags = daff.CompareFlags()
    for name in index:
        flags.addPrimaryKey(name)
    diff = daff.Coopy.diff(a, b, flags=flags)

    # NOTE(review): a diff of at most one row is treated as "no differences";
    # presumably that single row is daff's header row -- confirm.
    if len(diff.data) <= 1:
        return ""

    output = daff.TerminalDiffRender().render(diff)

    # stdout may be missing or replaced by a stream without a usable file
    # descriptor (for example an in-memory capture stream); treat that as
    # "not a terminal" instead of failing.
    try:
        is_tty = os.isatty(sys.stdout.fileno())
    except (AttributeError, OSError, ValueError):
        is_tty = False

    # The terminal format setting is ignored by the renderer, so we need to
    # strip escape sequences by hand. :-(
    if not is_tty:
        output = strip_ansi_escapes(output)
    return output
def dataframe_to_table(df: pd.DataFrame) -> Table:
    """Convert a DataFrame into a table: a header row of column names
    followed by one list of values per DataFrame row.

    Columns are sorted by label so that their order is consistent.
    """
    ordered = df.sort_index(axis=1)
    header = ordered.columns.tolist()
    body = [record.tolist() for _, record in ordered.iterrows()]
    return [header] + body
def compare_with_snapshot(input_df: pd.DataFrame,
                          snapshot: module.SnapshotTest,
                          index: List = []) -> None:
    """Compare a DataFrame with the test's saved snapshot.

    The DataFrame is converted to an encoded table. Unless the snapshot is
    being updated, a stored snapshot (if any) is loaded back into a table and
    compared with it; a non-empty diff raises an ``AssertionError`` carrying
    the rendered diff. Finally the formatted table is passed to
    ``snapshot.assert_match``.

    :param input_df: DataFrame to compare (in the module example, the output
        of the transform)
    :param snapshot: A SnapshotTest that can read and write a saved Snapshot
    :param index: Column names passed to `format_diff`, which registers them
        as primary keys for the diff. The default list is only read.
    :raises AssertionError: if the table differs from the stored snapshot
    """
    input_table = encode_table(dataframe_to_table(input_df))

    # Because snapshots compare only between strings,
    # to check data types we'll need to convert the
    # saved snapshot and check for differences
    # NOTE(review): this lookup runs before `assert_match`; confirm how the
    # snapshot module behaves when nothing has been stored for this test yet.
    res = snapshot.module[snapshot.test_name]
    if not snapshot.update and res:
        expected_table = load_table(res)
        if input_table != expected_table:
            output = format_diff(input_table, expected_table, index=index)
            if output != "":
                msg = f'Transform produced differences from the ' \
                      f'snapshot!\n{output}'
                raise AssertionError(msg)

    # Format the table to a good format for Git diff
    # and as a string for `--snapshot-update` to overwrite
    formatted_table = format_table(input_table)
    snapshot.assert_match(formatted_table)