diff --git a/docs/source/settings.rst b/docs/source/settings.rst
index fc80468a..0c21aa9b 100644
--- a/docs/source/settings.rst
+++ b/docs/source/settings.rst
@@ -355,6 +355,20 @@ Considering the spider returns the following items:
     'spidermon_item_scraped_count/dict/field4/field4.1/field4.1.2': 1
     'spidermon_item_scraped_count/dict/field4/field4.1/field4.1.3': 1
 
+SPIDERMON_FIELD_COVERAGE_TOLERANCE
+-----------------------------------
+Default: ``0``
+
+A float value between 0 and 1 (representing 0% to 100%) that defines the tolerance for field coverage validation. A value outside this range makes the monitor raise ``ValueError``.
+
+When set, this tolerance allows for small variations in field coverage to avoid false alarms when coverage is very close to the expected threshold. The monitor will only fail if the actual coverage plus the tolerance is still below the expected coverage.
+
+For example, if you set a tolerance of 0.05 (5%) and expect 95% coverage for a field, the monitor will only fail if the actual coverage is below 90% (95% - 5%).
+
+.. code-block:: python
+
+    SPIDERMON_FIELD_COVERAGE_TOLERANCE = 0.05  # 5% tolerance
+
 SPIDERMON_MONITOR_SKIPPING_RULES
 --------------------------------
 Default: ``None``
diff --git a/spidermon/contrib/scrapy/monitors/monitors.py b/spidermon/contrib/scrapy/monitors/monitors.py
index 3ea73fe1..e30534cb 100644
--- a/spidermon/contrib/scrapy/monitors/monitors.py
+++ b/spidermon/contrib/scrapy/monitors/monitors.py
@@ -28,6 +28,7 @@
     "SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED"
 )
 SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE"
+SPIDERMON_FIELD_COVERAGE_TOLERANCE = "SPIDERMON_FIELD_COVERAGE_TOLERANCE"
 
 
 @monitors.name("Extracted Items Monitor")
@@ -439,6 +440,18 @@ class MyCustomItem(scrapy.Item):
             "MyCustomItem/field_2": 1.0,
         }
 
+    You can also configure a tolerance setting to allow for small variations in field coverage.
+    This is useful to avoid false alarms when coverage is very close to the expected threshold.
+    Use the ``SPIDERMON_FIELD_COVERAGE_TOLERANCE`` setting to define the tolerance as a float
+    between 0 and 1 (representing 0% to 100%). The default value is 0 (no tolerance).
+
+    For example, if you set a tolerance of 0.05 (5%) and expect 95% coverage for a field,
+    the monitor will only fail if the actual coverage is below 90% (95% - 5%).
+
+    .. code-block:: python
+
+        SPIDERMON_FIELD_COVERAGE_TOLERANCE = 0.05  # 5% tolerance
+
     """
 
     def run(self, result):
@@ -460,6 +473,14 @@ def test_check_if_field_coverage_rules_are_met(self):
         if skip_no_items and int(items_scraped) == 0:
             self.skipTest("No items were scraped.")
 
+        tolerance = self.crawler.settings.getfloat(
+            SPIDERMON_FIELD_COVERAGE_TOLERANCE, 0
+        )
+        if tolerance < 0 or tolerance > 1:
+            raise ValueError(
+                f"SPIDERMON_FIELD_COVERAGE_TOLERANCE must be between 0 and 1, got {tolerance}"
+            )
+
         failures = []
         field_coverage_rules = self.crawler.settings.getdict(
             "SPIDERMON_FIELD_COVERAGE_RULES"
@@ -468,10 +489,13 @@ def test_check_if_field_coverage_rules_are_met(self):
             actual_coverage = self.data.stats.get(
                 f"spidermon_field_coverage/{field}", 0
             )
-            if actual_coverage < expected_coverage:
+            # Compare the shortfall against the tolerance with a tiny epsilon:
+            # 0.7 + 0.1 evaluates to 0.7999999999999999, so a plain "<" would
+            # fail a field sitting exactly on the tolerance boundary.
+            if expected_coverage - actual_coverage - tolerance > 1e-9:
                 failures.append(
-                    "{} (expected {}, got {})".format(
-                        field, expected_coverage, actual_coverage
+                    "{} (expected {}, got {}, tolerance: {})".format(
+                        field, expected_coverage, actual_coverage, tolerance
                     )
                 )
 
diff --git a/tests/contrib/scrapy/test_monitors_field_coverage.py b/tests/contrib/scrapy/test_monitors_field_coverage.py
index 8debae47..b733ef16 100644
--- a/tests/contrib/scrapy/test_monitors_field_coverage.py
+++ b/tests/contrib/scrapy/test_monitors_field_coverage.py
@@ -143,3 +143,131 @@ def test_monitor_skip_if_no_items_set_false(field_coverage_monitor_suite):
     monitor_runner.run(field_coverage_monitor_suite, **data)
 
     assert not monitor_runner.result.wasSuccessful()
+
+
+def test_monitor_pass_with_tolerance_when_coverage_slightly_below_expected(
+    field_coverage_monitor_suite,
+):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.95,
+        },
+        "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.05,
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.91}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    monitor_runner.run(field_coverage_monitor_suite, **data)
+
+    assert monitor_runner.result.wasSuccessful()
+
+
+def test_monitor_fail_with_tolerance_when_coverage_too_far_below_expected(
+    field_coverage_monitor_suite,
+):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.95,
+        },
+        "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.05,
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.89}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    monitor_runner.run(field_coverage_monitor_suite, **data)
+
+    assert not monitor_runner.result.wasSuccessful()
+
+
+def test_monitor_pass_with_tolerance_when_coverage_exactly_at_tolerance_threshold(
+    field_coverage_monitor_suite,
+):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.95,
+        },
+        "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.05,
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.90}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    monitor_runner.run(field_coverage_monitor_suite, **data)
+
+    assert monitor_runner.result.wasSuccessful()
+
+
+def test_monitor_pass_at_tolerance_threshold_despite_float_rounding(
+    field_coverage_monitor_suite,
+):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.8,
+        },
+        "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 0.1,
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.7}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    monitor_runner.run(field_coverage_monitor_suite, **data)
+
+    assert monitor_runner.result.wasSuccessful()
+
+
+def test_monitor_default_tolerance_is_zero(field_coverage_monitor_suite):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.95,
+        },
+        # No tolerance setting - should default to 0
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.94}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    monitor_runner.run(field_coverage_monitor_suite, **data)
+
+    assert not monitor_runner.result.wasSuccessful()
+
+
+def test_monitor_raise_value_error_for_invalid_tolerance_negative(
+    field_coverage_monitor_suite,
+):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.8,
+        },
+        "SPIDERMON_FIELD_COVERAGE_TOLERANCE": -0.1,
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.8}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    # NOTE(review): assumes the runner lets the ValueError propagate instead of
+    # recording it as a monitor error -- confirm against the runner behavior.
+    with pytest.raises(
+        ValueError, match="SPIDERMON_FIELD_COVERAGE_TOLERANCE must be between 0 and 1"
+    ):
+        monitor_runner.run(field_coverage_monitor_suite, **data)
+
+
+def test_monitor_raise_value_error_for_invalid_tolerance_greater_than_one(
+    field_coverage_monitor_suite,
+):
+    settings = {
+        "SPIDERMON_ADD_FIELD_COVERAGE": True,
+        "SPIDERMON_FIELD_COVERAGE_RULES": {
+            "dict/field": 0.8,
+        },
+        "SPIDERMON_FIELD_COVERAGE_TOLERANCE": 1.1,
+    }
+    stats = {"spidermon_field_coverage/dict/field": 0.8}
+    data = make_data_for_monitor(settings=settings, stats=stats)
+    monitor_runner = data.pop("runner")
+    with pytest.raises(
+        ValueError, match="SPIDERMON_FIELD_COVERAGE_TOLERANCE must be between 0 and 1"
+    ):
+        monitor_runner.run(field_coverage_monitor_suite, **data)