diff --git a/scripts/sequence_labeling/data.py b/scripts/sequence_labeling/data.py
index f160f607da..89043e1182 100644
--- a/scripts/sequence_labeling/data.py
+++ b/scripts/sequence_labeling/data.py
@@ -131,7 +131,7 @@ def remove_docstart_sentence(sentences):
     return ret
 
 
-def bert_tokenize_sentence(sentence, bert_tokenizer):
+def bert_tokenize_sentence(sentence, bert_tokenizer, tagging_first_token):
     """Apply BERT tokenizer on a tagged sentence to break words into sub-words.
 
     This function assumes input tags are following IOBES, and outputs IOBES tags.
@@ -141,6 +141,9 @@ def bert_tokenize_sentence(sentence, bert_tokenizer):
         List of tagged words
     bert_tokenizer: nlp.data.BertTokenizer
         BERT tokenizer
+    tagging_first_token: bool
+        If True, assign the tag of a word to its first sub-token; otherwise assign
+        it to the last sub-token. All other sub-tokens receive the null tag.
 
     Returns
     -------
@@ -151,14 +154,20 @@ def bert_tokenize_sentence(sentence, bert_tokenizer):
         # break a word into sub-word tokens
         sub_token_texts = bert_tokenizer(token.text)
         # only the first token of a word is going to be tagged
-        ret.append(TaggedToken(text=sub_token_texts[0], tag=token.tag))
-        ret += [TaggedToken(text=sub_token_text, tag=NULL_TAG)
-                for sub_token_text in sub_token_texts[1:]]
+        if tagging_first_token:
+            ret.append(TaggedToken(text=sub_token_texts[0], tag=token.tag))
+            ret += [TaggedToken(text=sub_token_text, tag=NULL_TAG)
+                    for sub_token_text in sub_token_texts[1:]]
+        else:
+            # only the last token of a word is going to be tagged
+            ret += [TaggedToken(text=sub_token_text, tag=NULL_TAG)
+                    for sub_token_text in sub_token_texts[:-1]]
+            ret.append(TaggedToken(text=sub_token_texts[-1], tag=token.tag))
     return ret
 
 
-def load_segment(file_path, bert_tokenizer):
+def load_segment(file_path, bert_tokenizer, tagging_first_token):
     """Load CoNLL format NER datafile with BIO-scheme tags.
 
     Tagging scheme is converted into BIOES, and words are tokenized into wordpieces
@@ -169,6 +178,9 @@ def load_segment(file_path, bert_tokenizer):
     file_path: str
         Path of the file
     bert_tokenizer: nlp.data.BERTTokenizer
+    tagging_first_token: bool
+        If True, assign the tag of a word to its first sub-token; otherwise assign
+        it to the last sub-token.
 
     Returns
     -------
@@ -177,7 +189,7 @@ def load_segment(file_path, bert_tokenizer):
     logging.info('Loading sentences in %s...', file_path)
     bio2_sentences = remove_docstart_sentence(read_bio_as_bio2(file_path))
     bioes_sentences = [bio_bioes(sentence) for sentence in bio2_sentences]
-    subword_sentences = [bert_tokenize_sentence(sentence, bert_tokenizer)
+    subword_sentences = [bert_tokenize_sentence(sentence, bert_tokenizer, tagging_first_token)
                          for sentence in bioes_sentences]
 
     logging.info('load %s, its max seq len: %d',
@@ -203,19 +215,26 @@ class BERTTaggingDataset:
         Length of the input sequence to BERT.
     is_cased: bool
         Whether to use cased model.
+    tagging_first_token: bool, optional (default: True)
+        If True, assign the tag of a word to its first sub-token; otherwise assign
+        it to the last sub-token.
+
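Note: for intuition, here is a minimal standalone sketch of the two sub-token labeling
schemes implemented above. The toy tokenizer, the namedtuple, and the tag strings are
stand-ins invented for this example; they are not the script's BERTTokenizer,
TaggedToken, or tag vocabulary:

    from collections import namedtuple

    TaggedToken = namedtuple('TaggedToken', ['text', 'tag'])
    NULL_TAG = 'X'  # stand-in for the script's NULL_TAG

    def toy_tokenizer(word):
        # pretend the BERT tokenizer splits 'Washington' into three wordpieces
        return {'Washington': ['Wash', '##ing', '##ton']}.get(word, [word])

    def tokenize_tagged(sentence, tagging_first_token=True):
        ret = []
        for token in sentence:
            subs = toy_tokenizer(token.text)
            if tagging_first_token:
                ret.append(TaggedToken(subs[0], token.tag))
                ret += [TaggedToken(s, NULL_TAG) for s in subs[1:]]
            else:
                ret += [TaggedToken(s, NULL_TAG) for s in subs[:-1]]
                ret.append(TaggedToken(subs[-1], token.tag))
        return ret

    sent = [TaggedToken('Washington', 'S-LOC'), TaggedToken('snowed', 'O')]
    print(' '.join('%s/%s' % t for t in tokenize_tagged(sent, True)))
    # Wash/S-LOC ##ing/X ##ton/X snowed/O
    print(' '.join('%s/%s' % t for t in tokenize_tagged(sent, False)))
    # Wash/X ##ing/X ##ton/S-LOC snowed/O

Either way, exactly one sub-token per word carries the word's tag; the flag only moves
which sub-token receives the supervision signal.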
+ """ def __init__(self, text_vocab, train_path, dev_path, test_path, seq_len, is_cased, - tag_vocab=None): + tag_vocab=None, tagging_first_token=True): self.text_vocab = text_vocab self.seq_len = seq_len self.bert_tokenizer = nlp.data.BERTTokenizer(vocab=text_vocab, lower=not is_cased) train_sentences = [] if train_path is None else load_segment(train_path, - self.bert_tokenizer) - dev_sentences = [] if dev_path is None else load_segment(dev_path, self.bert_tokenizer) - test_sentences = [] if test_path is None else load_segment(test_path, self.bert_tokenizer) + self.bert_tokenizer, tagging_first_token) + dev_sentences = [] if dev_path is None else load_segment(dev_path, self.bert_tokenizer, tagging_first_token) + test_sentences = [] if test_path is None else load_segment(test_path, self.bert_tokenizer, tagging_first_token) all_sentences = train_sentences + dev_sentences + test_sentences if tag_vocab is None: @@ -318,7 +335,7 @@ def num_tag_types(self): def convert_arrays_to_text(text_vocab, tag_vocab, - np_text_ids, np_true_tags, np_pred_tags, np_valid_length): + np_text_ids, np_true_tags, np_pred_tags, np_valid_length, tagging_first_token=True): """Convert numpy array data into text Parameters @@ -327,6 +344,10 @@ def convert_arrays_to_text(text_vocab, tag_vocab, np_true_tags: tag_ids (batch_size, seq_len) np_pred_tags: tag_ids (batch_size, seq_len) np.array: valid_length (batch_size,) the number of tokens until [SEP] token + tagging_first_token: bool, optional (default: True) + By default, only the first token of a word is going to be tagged. + If ``tagging_first_token`` is set to False, then the last token of a word is going to be tagged. + Returns ------- @@ -337,19 +358,32 @@ def convert_arrays_to_text(text_vocab, tag_vocab, for sample_index in range(np_valid_length.shape[0]): sample_len = np_valid_length[sample_index] entries = [] + tmptext = "" for i in range(1, sample_len - 1): token_text = text_vocab.idx_to_token[np_text_ids[sample_index, i]] true_tag = tag_vocab.idx_to_token[int(np_true_tags[sample_index, i])] pred_tag = tag_vocab.idx_to_token[int(np_pred_tags[sample_index, i])] # we don't need to predict on NULL tags - if true_tag == NULL_TAG: - last_entry = entries[-1] - entries[-1] = PredictedToken(text=last_entry.text + token_text, - true_tag=last_entry.true_tag, - pred_tag=last_entry.pred_tag) + if tagging_first_token: + if true_tag == NULL_TAG: + last_entry = entries[-1] + entries[-1] = PredictedToken(text=last_entry.text + token_text, + true_tag=last_entry.true_tag, + pred_tag=last_entry.pred_tag) + else: + entries.append(PredictedToken(text=token_text, + true_tag=true_tag, pred_tag=pred_tag)) else: - entries.append(PredictedToken(text=token_text, - true_tag=true_tag, pred_tag=pred_tag)) - + if true_tag == NULL_TAG: + tmptext += token_text + else: + if len(tmptext) > 0: + text = tmptext + token_text + entries.append(PredictedToken(text=text, + true_tag=true_tag, pred_tag=pred_tag)) + tmptext = '' + else: + entries.append(PredictedToken(text=token_text, + true_tag=true_tag, pred_tag=pred_tag)) predictions.append(entries) return predictions diff --git a/scripts/sequence_labeling/finetune_bert.py b/scripts/sequence_labeling/finetune_bert.py index a0943fd05c..126f057640 100644 --- a/scripts/sequence_labeling/finetune_bert.py +++ b/scripts/sequence_labeling/finetune_bert.py @@ -77,6 +77,8 @@ def parse_args(): help='Learning rate for optimization') arg_parser.add_argument('--warmup-ratio', type=float, default=0.1, help='Warmup ratio for learning rate scheduling') + 
+    arg_parser.add_argument('--tagging-first-token', type=str2bool, default=True,
+                            help='If True, tag the first word piece of each word; '
+                                 'otherwise tag the last word piece')
 
     args = arg_parser.parse_args()
     return args
@@ -95,7 +98,8 @@ def main(config):
                                  config.dropout_prob)
 
     dataset = BERTTaggingDataset(text_vocab, config.train_path, config.dev_path, config.test_path,
-                                 config.seq_len, config.cased)
+                                 config.seq_len, config.cased,
+                                 tagging_first_token=config.tagging_first_token)
 
     train_data_loader = dataset.get_train_data_loader(config.batch_size)
     dev_data_loader = dataset.get_dev_data_loader(config.batch_size)
@@ -178,7 +182,8 @@ def evaluate(data_loader):
             np_true_tags = tag_ids.asnumpy()
             predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids,
-                                                  np_true_tags, np_pred_tags, np_valid_length)
+                                                  np_true_tags, np_pred_tags, np_valid_length,
+                                                  tagging_first_token=config.tagging_first_token)
 
     all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions]
     all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions]
diff --git a/scripts/sequence_labeling/predict_ner.py b/scripts/sequence_labeling/predict_ner.py
index abdc3ec535..4770b52cbe 100644
--- a/scripts/sequence_labeling/predict_ner.py
+++ b/scripts/sequence_labeling/predict_ner.py
@@ -22,7 +22,7 @@
 import os
 
 import mxnet as mx
-from ner_utils import get_bert_model, get_context
+from ner_utils import get_bert_model, get_context, str2bool
 from ner_utils import load_metadata
 from data import BERTTaggingDataset, convert_arrays_to_text
 from model import BERTTagger
@@ -68,6 +68,9 @@ def parse_args():
                             help='Number (index) of GPU to run on, e.g. 0. '
                                  'If not specified, CPU context is used.')
     arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
+    arg_parser.add_argument('--tagging-first-token', type=str2bool, default=True,
+                            help='If True, tag the first word piece of each word; '
+                                 'otherwise tag the last word piece')
 
     args = arg_parser.parse_args()
     return args
@@ -81,7 +84,8 @@ def main(config):
                                  train_config.dropout_prob)
 
     dataset = BERTTaggingDataset(text_vocab, None, None, config.test_path,
-                                 config.seq_len, train_config.cased, tag_vocab=tag_vocab)
+                                 config.seq_len, train_config.cased, tag_vocab=tag_vocab,
+                                 tagging_first_token=config.tagging_first_token)
 
     test_data_loader = dataset.get_test_data_loader(config.batch_size)
@@ -112,7 +116,8 @@ def evaluate(data_loader):
             np_true_tags = tag_ids.asnumpy()
             predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids,
-                                                  np_true_tags, np_pred_tags, np_valid_length)
+                                                  np_true_tags, np_pred_tags, np_valid_length,
+                                                  tagging_first_token=config.tagging_first_token)
 
     all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions]
     all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions]
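Note: a compact trace of how convert_arrays_to_text reassembles words under the
last-sub-token scheme. The triples below are toy data; like the patched code, the
sketch concatenates wordpieces verbatim, so the '##' markers remain in the merged text:

    NULL_TAG = 'X'  # stand-in for the script's NULL_TAG

    def merge_last_tagged(triples):
        # mirrors the patch's else-branch: buffer untagged sub-tokens and emit a
        # word once its tagged last sub-token arrives
        entries, buf = [], ''
        for text, true_tag, pred_tag in triples:
            if true_tag == NULL_TAG:
                buf += text
            else:
                entries.append((buf + text, true_tag, pred_tag))
                buf = ''
        return entries

    pieces = [('Wash', 'X', 'X'), ('##ing', 'X', 'X'),
              ('##ton', 'S-LOC', 'S-LOC'), ('snowed', 'O', 'O')]
    print(merge_last_tagged(pieces))
    # [('Wash##ing##ton', 'S-LOC', 'S-LOC'), ('snowed', 'O', 'O')]

The patch distinguishes an empty from a non-empty buffer, but since '' + token_text
equals token_text the two branches build the same entry; the distinction only spells
out the common single-piece case.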