Skip to content

Commit 67c8d14

Browse files
committed
fix: add a new tnm regex pattern by default.
1 parent 8f9847a commit 67c8d14

File tree

3 files changed

+97
-5
lines changed

3 files changed

+97
-5
lines changed

edsnlp/pipes/ner/tnm/patterns.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
prefix_pattern = r"(?P<prefix>[cpPyraum]p?)"
1+
prefix_pattern = r"(?P<tumour_prefix>[cpPyraum]p?)"
22
tumour_pattern = r"T\s?(?P<tumour>([0-4o]|is))?(?P<tumour_specification>[abcdx]|mi)?"
33
tumour_pattern += r"(?:\((?P<tumour_suffix>[^()]{1,10})\))?"
44
node_pattern = r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}N\s?(?P<node>[0-3o]|x)"
@@ -9,7 +9,7 @@
99
metastasis_pattern = (
1010
r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P<metastasis>([01o]|x))x?)" # noqa: E501
1111
)
12-
resection_completeness = r"(\s{,2}\/?\s{,2}R\s?(?P<resection_completeness>[012]))"
12+
resection_pattern = r"(\s{,2}\/?\s{,2}R\s?(?P<resection>[012]))"
1313

1414
version_pattern = (
1515
r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)"
@@ -23,6 +23,6 @@
2323
tnm_pattern += prefix_pattern + r"\s{,2}?" + f"({tumour_pattern})"
2424
tnm_pattern += r"(\s{,2}" + f"{node_pattern})?"
2525
tnm_pattern += r"(\s{,2}" + f"{metastasis_pattern})?"
26-
tnm_pattern += r"(\s{,2}" + f"{resection_completeness})?"
26+
tnm_pattern += r"(\s{,2}" + f"{resection_pattern})?"
2727
tnm_pattern += f"({spacer}{version_pattern})?"
2828
tnm_pattern = r"(?:\b|^)" + tnm_pattern + r"(?:\b|$)"
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Regex fragment for the "T" part of a TNM expression.
# Named groups: tumour_prefix, tumour, tumour_specification, tumour_suffix.
tumour_pattern = "".join(
    (
        # Optional tumour prefix: one or two prefix letters, then at most
        # one whitespace character.
        r"(?P<tumour_prefix>[cpyramP]{1,2}\s?)?",
        # Literal 'T', optionally followed by a single whitespace character.
        r"T\s?",
        # Tumour value, mandatory once 'T' is present: 0-4, "is", or X/x.
        r"(?P<tumour>([0-4]|is|[Xx]))",
        # Optional tumour specification: one of a/b/c/d/x, or "mi".
        r"(?P<tumour_specification>[abcdx]|mi)?",
        # Optional parenthesised tumour suffix (1 to 10 non-parenthesis chars).
        r"(?:\s?\((?P<tumour_suffix>[^()]{1,10})\))?",
    )
)
8+
9+
# Regex fragment for the "N" part of a TNM expression.
# Named groups: node_prefix, node, node_specification, node_suffix.
node_pattern = (
    # Optional node prefix: one or two prefix letters, then at most one
    # whitespace character.
    r"(?P<node_prefix>[cpyraP]{1,2}\s?)?"
    # Literal 'N', optionally followed by a single whitespace character.
    r"N\s?"
    # Node value, mandatory once 'N' is present: X/x, 0-4 or '+'.
    r"(?P<node>[Xx01234\+])"
    # Optional node specification. The sign classes are written `[-+]`:
    # a comma inside the class would be a literal comma and would swallow a
    # separator that follows the component (e.g. "N0i," -> "i,").
    r"(?P<node_specification>[abcdx]|mi|sn|i[-+]|mol[-+]|\(mi\)|\(sn\)|"
    r"\(i[-+]\)|\(mol[-+]\)|\(\d+\s*/\s*\d+\))?"
    # Optional parenthesised node suffix (1 to 10 non-parenthesis chars).
    r"(?:\s?\((?P<node_suffix>[^()]{1,10})\))?"
)
17+
18+
# Regex fragment for the "M" part of a TNM expression.
# Named groups: metastasis_prefix, metastasis, metastasis_specification.
metastasis_pattern = "".join(
    (
        # Optional metastasis prefix: one or two prefix letters, then at most
        # one whitespace character.
        r"(?P<metastasis_prefix>[cpyraP]{1,2}\s?)?",
        # Literal 'M', optionally followed by a single whitespace character.
        r"M\s?",
        # Metastasis value, mandatory once 'M' is present: X/x, 0-3 or '+'.
        r"(?P<metastasis>[Xx0123\+])",
        # Optional specification: a-d, a positive marker (bare or in
        # parentheses), or one of the listed three-letter codes.
        r"(?P<metastasis_specification>[abcd]|i\+|mol\+|cy\+|\(i\+\)|\(mol\+\)|",
        r"\(cy\+\)|PUL|OSS|HEP|BRA|LYM|OTH|MAR|PLE|PER|ADR|SKI)?",
    )
)
25+
26+
# Regex fragment for a "PL" (pleura) component: literal "PL", at most one
# whitespace character, then an optional value (0-3 or 'x') captured in the
# `pleura` group. The original comment notes this is used for lung cancer.
pleura_pattern = r"PL\s?(?P<pleura>([0123]|x))?"
29+
30+
# Regex fragment for the "R" (resection) component.
# Named groups: resection, resection_specification, resection_loc, r_loc.
# Every part after the leading 'R' is optional, so a bare "R" matches.
resection_pattern = "".join(
    (
        # Literal 'R', optionally followed by a single whitespace character.
        r"R\s?",
        # Optional resection completeness value: X/x, 0, 1 or 2.
        r"(?P<resection>[Xx012])?",
        # Optional specification ("is" / "cy+", bare or in parentheses).
        # The group is optional twice over, so when nothing is present it
        # captures the empty string rather than being unset.
        r"(?P<resection_specification>(is|cy\+|\(is\)|\(cy\+\))?)?",
        # Optional localisation: zero or more "(letters)" items, each followed
        # by any run of commas, semicolons or whitespace. `r_loc` holds the
        # last item; `resection_loc` is the empty string when there are none.
        r"(?P<resection_loc>(\((?P<r_loc>[a-z]+)\)[,;\s]*)*)?",
    )
)
36+
37+
# Regex fragment for the classification-version mention.
# Named groups: version, version_year.
version_pattern = "".join(
    (
        # Optional opening parenthesis, then the version name (all lower-case
        # or all upper-case).
        r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)",
        # Mandatory whitespace, an optional edition word, optional whitespace.
        r"\s+([éeE]ditions|[éeE]d\.?)?\s*",
        # Year of the version (four or two digits), optional closing parenthesis.
        r"(?P<version_year>\d{4}|\d{2})\)?",
    )
)
42+
43+
# Separator allowed between TNM components: optional whitespace (including
# newlines, which `\s` covers), at most one comma or slash, then optional
# whitespace. It can match the empty string. The capturing group is kept so
# the group numbering of patterns built from this fragment does not change.
# A separate `|\n` alternative is unnecessary: `\s*` already matches a
# lone newline.
TNM_space = r"(\s*[,\/]?\s*)"
44+
45+
# We need to exclude patterns like 'T1', 'T2' when they are followed by a
# '.', ',' or '/' that does not introduce node / metastasis sections.
# Built as a negative lookahead wrapping an inner negative lookahead:
# a position is rejected when the text reads "T", a digit 0-4, one of
# '.', ',' or '/', and what comes next is NOT
# <node><optional separator><metastasis>.
# NOTE(review): the "?" appended after node_pattern and metastasis_pattern
# attaches to the last (already optional) group of each fragment, turning it
# into a lazy `??`; it does not make the whole fragment optional, so both a
# node and a metastasis section are required here. Confirm this is intended.
exclude_pattern = (
    r"(?!T\s*[0-4]\s*[.,\/](?!\s*"
    f"{node_pattern}?{TNM_space}?{metastasis_pattern}?"
    "))"
)
57+
58+
# Full TNM expression: a mandatory T component followed by optional N, M, PL,
# R and version components, in that order, each preceded by an optional
# separator. Each component is also captured as a whole in its own
# `*_component` group.
# NOTE(review): node_pattern and metastasis_pattern are embedded both here and
# inside exclude_pattern, so their named groups occur twice in the final
# string. The stdlib `re` module rejects duplicate group names; presumably
# this is compiled with an engine that accepts them (e.g. `regex`). Confirm.
tnm_pattern_new = "".join(
    (
        r"(?:\b|^)",  # must start on a word boundary or at the start
        exclude_pattern,  # lookahead guard, consumes no text
        "(?:",
        f"(?P<T_component>{tumour_pattern})",
        f"{TNM_space}?(?P<N_component>{node_pattern})?",
        f"{TNM_space}?(?P<M_component>{metastasis_pattern})?",
        f"{TNM_space}?(?P<PL_component>{pleura_pattern})?",
        f"{TNM_space}?(?P<R_component>{resection_pattern})?",
        f"{TNM_space}?(?P<V_component>{version_pattern})?",
        ")",
        r"(?:\b|$|\n)",  # must end on a word boundary, at the end, or on a newline
    )
)

edsnlp/pipes/ner/tnm/tnm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from edsnlp.utils.typing import cast
1313

1414
from .model import TNM
15-
from .patterns import tnm_pattern
15+
from .patterns_new import tnm_pattern_new
1616

1717

1818
class TNMMatcher(BaseNERComponent):
@@ -75,7 +75,7 @@ def __init__(
7575
nlp: Optional[PipelineProtocol],
7676
name: str = "tnm",
7777
*,
78-
pattern: Optional[Union[List[str], str]] = tnm_pattern,
78+
pattern: Optional[Union[List[str], str]] = tnm_pattern_new,
7979
attr: str = "TEXT",
8080
label: str = "tnm",
8181
span_setter: SpanSetterArg = {"ents": True, "tnm": True},

0 commit comments

Comments
 (0)