Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/source/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,20 @@ Considering the spider returns the following items:
'spidermon_item_scraped_count/dict/field4/field4.1/field4.1.2': 1
'spidermon_item_scraped_count/dict/field4/field4.1/field4.1.3': 1

SPIDERMON_FIELD_COVERAGE_TOLERANCE
-----------------------------------
Default: ``0``

A float value between 0 and 1 (representing 0% to 100%) that defines the tolerance for field coverage validation. Values outside this range cause the monitor to raise a ``ValueError``.

When set, this tolerance allows for small variations in field coverage to avoid false alarms when coverage is very close to the expected threshold. The monitor will only fail if the actual coverage plus the tolerance is still below the expected coverage.

For example, if you set a tolerance of 0.05 (5%) and expect 95% coverage for a field, the monitor will only fail if the actual coverage is below 90% (95% - 5%); a coverage of exactly 90% still passes.

.. code-block:: python

SPIDERMON_FIELD_COVERAGE_TOLERANCE = 0.05 # 5% tolerance

SPIDERMON_MONITOR_SKIPPING_RULES
--------------------------------
Default: ``None``
Expand Down
27 changes: 24 additions & 3 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED"
)
SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE"
SPIDERMON_FIELD_COVERAGE_TOLERANCE = "SPIDERMON_FIELD_COVERAGE_TOLERANCE"


@monitors.name("Extracted Items Monitor")
Expand Down Expand Up @@ -439,6 +440,18 @@ class MyCustomItem(scrapy.Item):
"MyCustomItem/field_2": 1.0,
}

You can also configure a tolerance setting to allow for small variations in field coverage.
This is useful to avoid false alarms when coverage is very close to the expected threshold.
Use the ``SPIDERMON_FIELD_COVERAGE_TOLERANCE`` setting to define the tolerance as a float
between 0 and 1 (representing 0% to 100%). The default value is 0 (no tolerance).

For example, if you set a tolerance of 0.05 (5%) and expect 95% coverage for a field,
the monitor will only fail if the actual coverage is below 90% (95% - 5%).

.. code-block:: python

SPIDERMON_FIELD_COVERAGE_TOLERANCE = 0.05 # 5% tolerance

"""

def run(self, result):
Expand All @@ -460,6 +473,14 @@ def test_check_if_field_coverage_rules_are_met(self):
if skip_no_items and int(items_scraped) == 0:
self.skipTest("No items were scraped.")

tolerance = self.crawler.settings.getfloat(
SPIDERMON_FIELD_COVERAGE_TOLERANCE, 0
)
if tolerance < 0 or tolerance > 1:
raise ValueError(
f"SPIDERMON_FIELD_COVERAGE_TOLERANCE must be between 0 and 1, got {tolerance}"
)

failures = []
field_coverage_rules = self.crawler.settings.getdict(
"SPIDERMON_FIELD_COVERAGE_RULES"
Expand All @@ -468,10 +489,10 @@ def test_check_if_field_coverage_rules_are_met(self):
actual_coverage = self.data.stats.get(
f"spidermon_field_coverage/{field}", 0
)
if actual_coverage < expected_coverage:
if actual_coverage + tolerance < expected_coverage:
failures.append(
"{} (expected {}, got {})".format(
field, expected_coverage, actual_coverage
"{} (expected {}, got {}, tolerance: {})".format(
field, expected_coverage, actual_coverage, tolerance
)
)

Expand Down
108 changes: 108 additions & 0 deletions tests/contrib/scrapy/test_monitors_field_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,111 @@ def test_monitor_skip_if_no_items_set_false(field_coverage_monitor_suite):
monitor_runner.run(field_coverage_monitor_suite, **data)

assert not monitor_runner.result.wasSuccessful()


def test_monitor_pass_with_tolerance_when_coverage_slightly_below_expected(
    field_coverage_monitor_suite,
):
    """Coverage of 0.91 against a 0.95 rule passes with a 0.05 tolerance."""
    monitor_kwargs = make_data_for_monitor(
        settings={
            "SPIDERMON_ADD_FIELD_COVERAGE": True,
            "SPIDERMON_FIELD_COVERAGE_RULES": {"dict/field": 0.95},
            "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.05,
        },
        stats={"spidermon_field_coverage/dict/field": 0.91},
    )
    # The helper returns the runner alongside the keyword arguments for run().
    runner = monitor_kwargs.pop("runner")

    runner.run(field_coverage_monitor_suite, **monitor_kwargs)

    assert runner.result.wasSuccessful()


def test_monitor_fail_with_tolerance_when_coverage_too_far_below_expected(
    field_coverage_monitor_suite,
):
    """Coverage of 0.89 against a 0.95 rule still fails with a 0.05 tolerance."""
    monitor_kwargs = make_data_for_monitor(
        settings={
            "SPIDERMON_ADD_FIELD_COVERAGE": True,
            "SPIDERMON_FIELD_COVERAGE_RULES": {"dict/field": 0.95},
            "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.05,
        },
        stats={"spidermon_field_coverage/dict/field": 0.89},
    )
    # The helper returns the runner alongside the keyword arguments for run().
    runner = monitor_kwargs.pop("runner")

    runner.run(field_coverage_monitor_suite, **monitor_kwargs)

    assert not runner.result.wasSuccessful()


def test_monitor_pass_with_tolerance_when_coverage_exactly_at_tolerance_threshold(
    field_coverage_monitor_suite,
):
    """Coverage of 0.90 against a 0.95 rule passes with a 0.05 tolerance.

    This pins the boundary case: coverage sitting exactly one tolerance
    below the expected value is expected to be accepted.
    """
    # NOTE(review): the outcome here depends on how 0.90 + 0.05 rounds in
    # binary floating point relative to 0.95 -- confirm it stays stable if
    # these values are ever changed.
    monitor_kwargs = make_data_for_monitor(
        settings={
            "SPIDERMON_ADD_FIELD_COVERAGE": True,
            "SPIDERMON_FIELD_COVERAGE_RULES": {"dict/field": 0.95},
            "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.05,
        },
        stats={"spidermon_field_coverage/dict/field": 0.90},
    )
    runner = monitor_kwargs.pop("runner")

    runner.run(field_coverage_monitor_suite, **monitor_kwargs)

    assert runner.result.wasSuccessful()


def test_monitor_default_tolerance_is_zero(field_coverage_monitor_suite):
    """Without a tolerance setting, coverage of 0.94 fails a 0.95 rule."""
    # SPIDERMON_FIELD_COVERAGE_TOLERANCE is deliberately left out so the
    # monitor has to fall back to its default of 0.
    monitor_kwargs = make_data_for_monitor(
        settings={
            "SPIDERMON_ADD_FIELD_COVERAGE": True,
            "SPIDERMON_FIELD_COVERAGE_RULES": {"dict/field": 0.95},
        },
        stats={"spidermon_field_coverage/dict/field": 0.94},
    )
    runner = monitor_kwargs.pop("runner")

    runner.run(field_coverage_monitor_suite, **monitor_kwargs)

    assert not runner.result.wasSuccessful()


def test_monitor_raise_value_error_for_invalid_tolerance_negative(
    field_coverage_monitor_suite,
):
    """A negative tolerance (-0.1) makes running the suite raise ``ValueError``."""
    expected_message = "SPIDERMON_FIELD_COVERAGE_TOLERANCE must be between 0 and 1"
    monitor_kwargs = make_data_for_monitor(
        settings={
            "SPIDERMON_ADD_FIELD_COVERAGE": True,
            "SPIDERMON_FIELD_COVERAGE_RULES": {"dict/field": 0.8},
            "SPIDERMON_FIELD_COVERAGE_TOLERANCE": -0.1,
        },
        stats={"spidermon_field_coverage/dict/field": 0.8},
    )
    runner = monitor_kwargs.pop("runner")

    # Only the run itself is expected to raise, so keep the block minimal.
    with pytest.raises(ValueError, match=expected_message):
        runner.run(field_coverage_monitor_suite, **monitor_kwargs)


def test_monitor_raise_value_error_for_invalid_tolerance_greater_than_one(
    field_coverage_monitor_suite,
):
    """A tolerance above 1 (1.1) makes running the suite raise ``ValueError``."""
    expected_message = "SPIDERMON_FIELD_COVERAGE_TOLERANCE must be between 0 and 1"
    monitor_kwargs = make_data_for_monitor(
        settings={
            "SPIDERMON_ADD_FIELD_COVERAGE": True,
            "SPIDERMON_FIELD_COVERAGE_RULES": {"dict/field": 0.8},
            "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 1.1,
        },
        stats={"spidermon_field_coverage/dict/field": 0.8},
    )
    runner = monitor_kwargs.pop("runner")

    # Only the run itself is expected to raise, so keep the block minimal.
    with pytest.raises(ValueError, match=expected_message):
        runner.run(field_coverage_monitor_suite, **monitor_kwargs)
Loading