Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ _build/
*.tar.gz
*.tsv
*.ann
*.spacy
*.RRF
*.pkl

# Editors
.idea
Expand All @@ -71,3 +74,6 @@ _build/
docs/reference
docs/changelog.md
docs/contributing.md

# Logs
*.log
57 changes: 0 additions & 57 deletions .pre-commit-config.yaml

This file was deleted.

185 changes: 185 additions & 0 deletions NER_model/configs/config.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
[paths]
train = "corpus/train.spacy"
dev = "corpus/dev.spacy"
vectors = null
init_tok2vec = null
bert = "../../word-embedding/finetuning-camembert-2021-07-29"

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = "eds"
pipeline = ["tok2vec","ner","qualifier"]
batch_size = 1
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null

[components]

[components.ner]
factory = "nested_ner"
scorer = {"@scorers": "eds.nested_ner_scorer.v1"}
ent_labels = ["DISO","Constantes","BIO_comp","Chemical_and_drugs","dosage","BIO","strength","form","SECTION_antecedent","SECTION_motif","SECTION_histoire","SECTION_examen_clinique","SECTION_examen_complementaire","SECTION_mode_de_vie","SECTION_traitement_entree","SECTION_antecedent_familiaux","SECTION_traitement_sortie","SECTION_conclusion"]

[components.qualifier]
factory = "eds.span_qualifier"

[components.qualifier.candidate_getter]
@misc = "eds.candidate_span_qualifier_getter"
on_ents = true
on_span_groups = false

qualifiers = ["_.Allergie", "_.Action","_.Certainty","_.Temporality","_.Negation","_.Family"]
label_constraints = {"_.Allergie":["Chemical_and_drugs"],"_.Action":["Chemical_and_drugs"],"_.Certainty":["Chemical_and_drugs"],"_.Temporality":["Chemical_and_drugs"],"_.Negation":["Chemical_and_drugs"],"_.Family":["Chemical_and_drugs"]}

[components.qualifier.model]
@architectures = "eds.span_multi_classifier.v1"
projection_mode = "dot"
pooler_mode = "max"

[components.qualifier.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.qualifier.model.tok2vec.embed]
@architectures = "spacy-transformers.TransformerListener.v1"
pooling = {"@layers":"reduce_mean.v1"}

[components.qualifier.model.tok2vec.encode]
@architectures = "spacy.MishWindowEncoder.v2"
width = 768
window_size = 1
depth = 2

[components.clean-entities]
factory = "clean-entities"

[components.ner.model]
@architectures = "eds.stack_crf_ner_model.v1"
mode = "joint"

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.ner.model.tok2vec.embed]
@architectures = "spacy-transformers.TransformerListener.v1"
pooling = {"@layers":"reduce_mean.v1"}

[components.ner.model.tok2vec.encode]
@architectures = "spacy.MishWindowEncoder.v2"
width = 768
window_size = 1
depth = 2

[components.tok2vec]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.tok2vec.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = ${paths.bert}
# name = "camembert-base"
mixed_precision = true

[components.tok2vec.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 64

[components.tok2vec.model.grad_scaler_config]

[components.tok2vec.model.tokenizer_config]
use_fast = true

[components.tok2vec.model.transformer_config]

[corpora]

[corpora.train]
@readers = "eds.Corpus.v1"
path = ${paths.train}
max_length = 384
gold_preproc = false
limit = 0
augmenter = null
seed = ${system.seed}
shuffle = true
filter_expr = null
#filter_expr = "any(ent._.event_type is not None for ent in doc.ents)"

[corpora.dev]
@readers = "eds.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
accumulate_gradient = 1
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1500
max_epochs = -1
max_steps = 20000
eval_frequency = 100
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
size = 3072
discard_oversize = true
buffer = 256

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = true
#@loggers = "DVCLive.v1"

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 0.0
use_averages = false
eps = 0.000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = ${training.max_steps}
initial_rate = 0.0001

[training.score_weights]
ents_f = 0.8
ents_p = 0.0
ents_r = 0.0
qual_f = 0.2
speed = 0.0
ents_per_type = null
qual_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
Loading