Skip to content

Commit 3bbf35e

Browse files
Cache multiregex matchers instead of patterns
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 4438526 commit 3bbf35e

File tree

6 files changed

+71
-63
lines changed

6 files changed

+71
-63
lines changed

src/packagedcode/cache.py

Lines changed: 41 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88
#
99

1010
import os
11-
import json
12-
import attr
1311
import fnmatch
12+
import pickle
13+
import multiregex
1414

15-
from commoncode.fileutils import create_dir
15+
import attr
1616

17+
from commoncode.fileutils import create_dir
1718
from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
1819
from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS
1920

@@ -29,6 +30,9 @@
2930
# global in-memory cache of the PkgManifestPatternsCache
3031
_PACKAGE_CACHE = None
3132

33+
# This is the Pickle protocol we use, which was added in Python 3.4.
34+
PICKLE_PROTOCOL = 4
35+
3236
PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6
3337
PACKAGE_INDEX_DIR = 'package_patterns_index'
3438
PACKAGE_INDEX_FILENAME = 'index_cache'
@@ -45,23 +49,21 @@ class PkgManifestPatternsCache:
4549
"""
4650

4751
handler_by_regex = attr.ib(default=attr.Factory(dict))
48-
system_multiregex_patterns = attr.ib(default=attr.Factory(list))
49-
application_multiregex_patterns = attr.ib(default=attr.Factory(list))
52+
system_package_matcher = attr.ib(default=None)
53+
application_package_matcher = attr.ib(default=None)
54+
all_package_matcher = attr.ib(default=None)
5055

5156
@staticmethod
52-
def all_multiregex_patterns(self):
53-
return self.application_multiregex_patterns + [
57+
def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns):
58+
return application_multiregex_patterns + [
5459
multiregex_pattern
55-
for multiregex_pattern in self.system_multiregex_patterns
56-
if multiregex_pattern not in self.application_multiregex_patterns
60+
for multiregex_pattern in system_multiregex_patterns
61+
if multiregex_pattern not in application_multiregex_patterns
5762
]
5863

5964
@classmethod
60-
def from_mapping(cls, cache_mapping):
61-
return cls(**cache_mapping)
62-
63-
@staticmethod
6465
def load_or_build(
66+
cls,
6567
packagedcode_cache_dir=packagedcode_cache_dir,
6668
scancode_cache_dir=scancode_cache_dir,
6769
force=False,
@@ -94,7 +96,6 @@ def load_or_build(
9496
print(str(e))
9597
print(traceback.format_exc())
9698

97-
9899
from scancode import lockfile
99100
lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME)
100101

@@ -109,29 +110,31 @@ def load_or_build(
109110
application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns(
110111
datafile_handlers=application_package_datafile_handlers,
111112
)
112-
package_cache = PkgManifestPatternsCache(
113+
all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns(
114+
application_multiregex_patterns, system_multiregex_patterns,
115+
)
116+
system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns)
117+
application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns)
118+
all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher)
119+
package_cache = cls(
113120
handler_by_regex=system_handlers_by_regex | application_handlers_by_regex,
114-
system_multiregex_patterns=system_multiregex_patterns,
115-
application_multiregex_patterns=application_multiregex_patterns,
121+
system_package_matcher=system_package_matcher,
122+
application_package_matcher=application_package_matcher,
123+
all_package_matcher=all_package_matcher,
116124
)
117125
package_cache.dump(cache_file)
118126
return package_cache
119127

120128
except lockfile.LockTimeout:
121129
# TODO: handle failure to acquire the lock in a nicer way
122-
raise
130+
raise
123131

124132
def dump(self, cache_file):
125133
"""
126-
Dump this package cache on disk at ``cache_file``.
134+
Dump this package cache on disk at ``cache_file``.
127135
"""
128-
package_cache = {
129-
"handler_by_regex": self.handler_by_regex,
130-
"system_multiregex_patterns": self.system_multiregex_patterns,
131-
"application_multiregex_patterns": self.application_multiregex_patterns,
132-
}
133-
with open(cache_file, 'w') as f:
134-
json.dump(package_cache, f)
136+
with open(cache_file, 'wb') as fn:
137+
pickle.dump(self, fn, protocol=PICKLE_PROTOCOL)
135138

136139

137140
def get_prematchers_from_glob_pattern(pattern):
@@ -203,20 +206,16 @@ def get_cache(
203206

204207
def load_cache_file(cache_file):
205208
"""
206-
Return a PkgManifestPatternsCache loaded from JSON ``cache_file``.
209+
Return a PkgManifestPatternsCache loaded from ``cache_file``.
207210
"""
208-
with open(cache_file) as f:
209-
cache = json.load(f)
210-
211-
# convert multiregex patterns from list to tuples while loading
212-
cache_transformed = {"handler_by_regex": cache.get("handler_by_regex")}
213-
cache_transformed["system_multiregex_patterns"] = [
214-
tuple(multiregex_pattern)
215-
for multiregex_pattern in cache.get("system_multiregex_patterns")
216-
]
217-
cache_transformed["application_multiregex_patterns"] = [
218-
tuple(multiregex_pattern)
219-
for multiregex_pattern in cache.get("application_multiregex_patterns")
220-
]
221-
222-
return PkgManifestPatternsCache.from_mapping(cache_transformed)
211+
with open(cache_file, 'rb') as lfc:
212+
try:
213+
return pickle.load(lfc)
214+
except Exception as e:
215+
msg = (
216+
'ERROR: Failed to load package cache (the file may be corrupted ?).\n'
217+
f'Please delete "{cache_file}" and retry.\n'
218+
'If the problem persists, copy this error message '
219+
'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/'
220+
)
221+
raise Exception(msg) from e

src/packagedcode/recognize.py

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
import os
1111
import sys
1212

13-
import multiregex
14-
1513
from commoncode import filetype
1614
from commoncode.fileutils import as_posixpath
1715

@@ -87,13 +85,12 @@ def _parse(
8785

8886
assert application or system or package_only
8987
if package_only or (application and system):
90-
multiregex_patterns = package_patterns.all_multiregex_patterns
88+
package_matcher = package_patterns.all_package_matcher
9189
elif application:
92-
multiregex_patterns = package_patterns.application_multiregex_patterns
90+
package_matcher = package_patterns.application_package_matcher
9391
elif system:
94-
multiregex_patterns = package_patterns.system_multiregex_patterns
92+
package_matcher = package_patterns.system_package_matcher
9593

96-
package_matcher = multiregex.RegexMatcher(multiregex_patterns)
9794
matched_patterns = package_matcher.match(package_path)
9895

9996
datafile_handlers = []
@@ -103,19 +100,14 @@ def _parse(
103100
if TRACE:
104101
logger_debug(f'_parse:.handler_ids: {handler_ids}')
105102

106-
datafile_handlers = [
103+
datafile_handlers.extend([
107104
HANDLER_BY_DATASOURCE_ID.get(handler_id)
108105
for handler_id in handler_ids
109-
]
106+
])
110107

111108
if not datafile_handlers:
112-
if BINARY_HANDLERS_PRESENT:
113-
datafile_handlers = BINARY_PACKAGE_DATAFILE_HANDLERS
114-
else:
115-
if TRACE:
116-
logger_debug(f'_parse: no package datafile detected at {package_path}')
117-
118-
return
109+
if TRACE:
110+
logger_debug(f'_parse: no package datafile detected at {package_path}')
119111

120112
for handler in datafile_handlers:
121113
if TRACE:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/package_patterns_index/

tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/packagedcode/test_cache.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import os.path
1111

1212
from packagedcode import cache
13+
from commoncode.fileutils import as_posixpath
14+
1315
from packages_test_utils import PackageTester
1416
from scancode_config import REGEN_TEST_FIXTURES
1517
from scancode.cli_test_utils import run_scan_click
@@ -33,17 +35,27 @@ def test_build_mappings_and_multiregex_patterns_works(self):
3335

3436
def test_build_package_cache_works(self):
3537
from packagedcode.about import AboutFileHandler
38+
from packagedcode.bower import BowerJsonHandler
3639

37-
package_cache_dir = self.get_test_loc('cache/package_patterns_index')
40+
package_cache_dir = self.get_test_loc('cache/')
3841
package_cache = cache.PkgManifestPatternsCache.load_or_build(
3942
packagedcode_cache_dir=package_cache_dir,
4043
application_package_datafile_handlers=[AboutFileHandler],
41-
system_package_datafile_handlers=[],
44+
system_package_datafile_handlers=[BowerJsonHandler],
4245
force=True,
4346
)
47+
test_path = "scancode-toolkit.ABOUT"
48+
49+
assert not package_cache.system_package_matcher.match(test_path)
50+
assert package_cache.application_package_matcher.match(test_path)
4451

45-
assert not package_cache.system_multiregex_patterns
46-
assert len(package_cache.application_multiregex_patterns) == 1
47-
assert '(?s:.*\\.ABOUT)\\Z' in package_cache.handler_by_regex
52+
regex, _match = package_cache.all_package_matcher.match(test_path).pop()
53+
assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id
54+
55+
def check_empty_file_scan_works(self):
4856

57+
test_file = self.get_test_loc('cache/.gitignore')
58+
package_path = as_posixpath(test_file)
59+
package_matcher = cache.get_cache()
4960

61+
assert not package_matcher.match(package_path)

tests/packagedcode/test_recognize.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,3 +202,8 @@ def test_recognize_rpmdb_sqlite(self):
202202
packages = recognize_package_data(test_file, system=True)
203203
assert packages
204204
assert isinstance(packages[0], models.PackageData)
205+
206+
def test_recognize_non_package_manifest_file(self):
207+
test_file = self.get_test_loc('cache/.gitignore')
208+
packages = recognize_package_data(test_file)
209+
assert not packages

0 commit comments

Comments
 (0)