Sequence Labeling Processor

import json
import logging
from typing import List, Union, Any, Optional

from tqdm import tqdm
from .base_processor import (
    EDDataProcessor,
    EDInputExample,
    EDInputFeatures,
    EAEDataProcessor,
    EAEInputExample,
    EAEInputFeatures
)

logger = logging.getLogger(__name__)

EDSLProcessor

Data processor for sequence labeling for event detection. The class inherits from the EDDataProcessor class and implements the functions left undefined there, including read_examples() and convert_examples_to_features(); a new function, get_final_labels(), is defined to obtain the final labels, and the remaining attributes and functions are reused from the EDDataProcessor class.

class EDSLProcessor(EDDataProcessor):
    """Data processor for sequence labeling for event detection.

    The class inherits from `EDDataProcessor` and implements the functions left undefined there, `read_examples()`
    and `convert_examples_to_features()`. A new function, `get_final_labels()`, is defined to align the word-level
    labels with the tokenizer output; the remaining attributes and functions are reused from `EDDataProcessor`.

    Attributes:
        is_overflow (`list`):
            One entry per example, holding the second value returned by `self._truncate()` for that example
            (presumably a flag telling whether the example exceeded `config.max_seq_length` -- TODO confirm
            against the `_truncate()` implementation).
    """

    def __init__(self,
                 config,
                 tokenizer: Any,
                 input_file: str) -> None:
        """Constructs a EDSLProcessor.

        Args:
            config:
                Configuration object; this class reads `language`, `type2id` and `max_seq_length` from it.
            tokenizer:
                Callable tokenizer, forwarded to the parent constructor and later invoked as
                `self.tokenizer(words, ...)`.
            input_file (`str`):
                Path of the JSON Lines file to load.
        """
        super().__init__(config, tokenizer)
        self.is_overflow = []
        self.read_examples(input_file)
        self.convert_examples_to_features()

    def read_examples(self,
                      input_file: str) -> None:
        """Obtains a collection of `EDInputExample`s for the dataset.

        Every non-blank line of `input_file` is parsed as a JSON object with the keys `id` and `text` and,
        optionally, `events` (a list of objects carrying a `type` and a list of `triggers`). The text is split
        into words with `get_words()` and each trigger span is tagged with the BIO scheme: `B-<type>` on its
        first word, `I-<type>` on the following words, and `O` everywhere else.

        Args:
            input_file (`str`):
                Path of the JSON Lines file to read.
        """
        self.examples = []
        language = self.config.language

        with open(input_file, "r", encoding="utf-8") as f:
            for line in tqdm(f.readlines(), desc=f"Reading from {input_file}"):
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline) instead of failing in `json.loads`.
                    continue
                item = json.loads(line)
                text = item["text"]
                words = get_words(text=text, language=language)
                labels = ["O"] * len(words)

                # Files without annotations simply keep the all-"O" labels.
                for event in item.get("events", []):
                    event_type = event["type"]
                    for trigger in event["triggers"]:
                        # `right_pos` is used as an exclusive bound: words left_pos .. right_pos - 1 are tagged.
                        left_pos, right_pos = get_left_and_right_pos(text, trigger, language, True)
                        labels[left_pos] = f"B-{event_type}"
                        for i in range(left_pos + 1, right_pos):
                            labels[i] = f"I-{event_type}"

                example = EDInputExample(example_id=item["id"], text=words, labels=labels)
                self.examples.append(example)

    def get_final_labels(self,
                         example: EDInputExample,
                         word_ids_of_each_token: List[Optional[int]],
                         label_all_tokens: Optional[bool] = False) -> List[Union[str, int]]:
        """Obtains the final label of each token.

        Args:
            example (`EDInputExample`):
                The example whose word-level `labels` are projected onto the tokens.
            word_ids_of_each_token (`List[Optional[int]]`):
                For every token, the index of the word it belongs to, or `None` for tokens that belong to no
                word.
            label_all_tokens (`bool`, defaults to `False`):
                If `True`, every sub-word token receives the label id of its word; otherwise only the first
                token of each word does and the remaining ones receive `-100`.

        Returns:
            A list with one entry per token: `-100` for tokens without a word (and, unless `label_all_tokens`
            is set, for non-initial sub-word tokens), otherwise `config.type2id` of the word's label.
        """
        final_labels = []
        pre_word_id = None
        for word_id in word_ids_of_each_token:
            if word_id is None:
                final_labels.append(-100)
            elif word_id != pre_word_id or label_all_tokens:
                # First token of a word, or a continuation token when all tokens are labelled.
                final_labels.append(self.config.type2id[example.labels[word_id]])
            else:
                final_labels.append(-100)
            pre_word_id = word_id

        return final_labels

    def convert_examples_to_features(self) -> None:
        """Converts the `EDInputExample`s into `EDInputFeatures`s.

        Rebuilds both `self.input_features` and `self.is_overflow` from `self.examples`, so the two lists
        always stay index-aligned with the examples even when the method is called more than once.
        """
        self.input_features = []
        self.is_overflow = []

        for example in tqdm(self.examples, desc="Processing features for SL"):
            # The tokenizer only pads; cutting to `max_seq_length` is done by `_truncate()` below so that
            # the overflow information can be recorded per example.
            outputs = self.tokenizer(example.text,
                                     padding="max_length",
                                     truncation=False,
                                     max_length=self.config.max_seq_length,
                                     is_split_into_words=True)
            # Roberta tokenizer doesn't return token_type_ids
            if "token_type_ids" not in outputs:
                outputs["token_type_ids"] = [0] * len(outputs["input_ids"])

            outputs, is_overflow = self._truncate(outputs, self.config.max_seq_length)
            self.is_overflow.append(is_overflow)

            # Keep the word ids no longer than the maximum sequence length as well.
            word_ids_of_each_token = get_word_ids(self.tokenizer, outputs, example.text)[: self.config.max_seq_length]
            final_labels = self.get_final_labels(example, word_ids_of_each_token, label_all_tokens=False)

            features = EDInputFeatures(
                example_id=example.example_id,
                input_ids=outputs["input_ids"],
                attention_mask=outputs["attention_mask"],
                token_type_ids=outputs["token_type_ids"],
                labels=final_labels,
            )
            self.input_features.append(features)

EAESLProcessor

Data processor for sequence labeling for event argument extraction. The class inherits from the EAEDataProcessor class and implements the functions left undefined there, including read_examples() and convert_examples_to_features(); two new functions, get_final_labels() and insert_marker(), are defined, and the remaining attributes and functions are reused from the EAEDataProcessor class.

Attributes:

  • positive_candidate_indices: A list of integers indicating the indices of positive trigger candidates.

class EAESLProcessor(EAEDataProcessor):
    """Data processor for sequence labeling for event argument extraction.

    The class inherits from `EAEDataProcessor` and implements the functions left undefined there,
    `read_examples()` and `convert_examples_to_features()`. Two new functions, `get_final_labels()` and
    `insert_marker()`, are defined; the remaining attributes and functions are reused from `EAEDataProcessor`.

    Attributes:
        positive_candidate_indices (`List[int]`):
            A list of integers indicating the indices of positive trigger candidates.
        is_overflow (`list`):
            One entry per example, holding the second value returned by `self._truncate()` for that example
            (presumably a flag telling whether the example exceeded `config.max_seq_length` -- TODO confirm
            against the `_truncate()` implementation).
    """

    def __init__(self,
                 config,
                 tokenizer: Any,
                 input_file: str,
                 pred_file: str,
                 is_training: Optional[bool] = False) -> None:
        """Constructs an EAESLProcessor.

        Args:
            config:
                Configuration object; this class reads `language`, `eae_eval_mode`, `role2id`, `markers` and
                `max_seq_length` from it. Note that `role2id` is extended in place with the key `"X"`.
            tokenizer:
                Callable tokenizer, forwarded to the parent constructor and later invoked as
                `self.tokenizer(words, ...)`.
            input_file (`str`):
                Path of the JSON Lines file to load.
            pred_file (`str`):
                Forwarded to the parent constructor (presumably the file holding the event detection
                predictions exposed as `self.event_preds` -- TODO confirm).
            is_training (`bool`, defaults to `False`):
                Forwarded to the parent constructor.
        """
        super().__init__(config, tokenizer, pred_file, is_training)
        self.positive_candidate_indices = []
        self.is_overflow = []
        # "X" is the label given to the inserted trigger markers (see `insert_marker()`); it maps to -100
        # like every other position that carries no real label.
        self.config.role2id["X"] = -100
        self.read_examples(input_file)
        self.convert_examples_to_features()

    def read_examples(self,
                      input_file: str) -> None:
        """Obtains a collection of `EAEInputExample`s for the dataset.

        Every non-blank line of `input_file` is parsed as a JSON object with the keys `id` and `text`.

        * If the object has an `events` key, one example is created per annotated trigger, with the argument
          mentions tagged in the BIO scheme (`B-<role>` / `I-<role>` / `O`), and per entry of
          `negative_triggers`, depending on `config.eae_eval_mode`:
            - `"default"`: triggers (annotated or negative) whose predicted type is `"NA"` are skipped;
            - `"loose"`: annotated triggers predicted as `"NA"` and all negative triggers are skipped;
            - `"strict"`: all annotated triggers are kept; negative triggers are kept unless predicted `"NA"`.
        * Otherwise one example is created per entry of `candidates` whose predicted type (taken from
          `self.event_preds`) is not `"NA"`, and its running index is stored in
          `self.positive_candidate_indices`.

        A running trigger counter is advanced for every trigger / candidate, skipped or not, so that it stays
        aligned with the predictions.

        Args:
            input_file (`str`):
                Path of the JSON Lines file to read.

        Raises:
            ValueError: If `config.eae_eval_mode` is not one of `"default"`, `"loose"` or `"strict"` when a
                negative trigger is processed.
            AssertionError: If the number of visited triggers differs from `len(self.event_preds)`.
        """
        self.examples = []
        # Reset together with `examples` so that re-reading a file does not accumulate stale indices.
        self.positive_candidate_indices = []
        language = self.config.language
        trigger_idx = 0
        with open(input_file, "r", encoding="utf-8") as f:
            for line in tqdm(f.readlines(), desc=f"Reading from {input_file}"):
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline) instead of failing in `json.loads`.
                    continue
                item = json.loads(line)
                text = item["text"]
                words = get_words(text=text, language=language)

                if "events" in item:
                    for event in item["events"]:
                        for trigger in event["triggers"]:
                            pred_type = self.get_single_pred(trigger_idx, input_file, true_type=event["type"])
                            trigger_idx += 1

                            # Evaluation mode for EAE
                            # If the predicted event type is NA, we don't consider the trigger
                            if self.config.eae_eval_mode in ["default", "loose"] and pred_type == "NA":
                                continue
                            trigger_left, trigger_right = get_left_and_right_pos(text, trigger, language, True)
                            labels = ["O"] * len(words)

                            for argument in trigger["arguments"]:
                                role = argument["role"]
                                for mention in argument["mentions"]:
                                    # `right_pos` is used as an exclusive bound.
                                    left_pos, right_pos = get_left_and_right_pos(text, mention, language, True)
                                    labels[left_pos] = f"B-{role}"
                                    for i in range(left_pos + 1, right_pos):
                                        labels[i] = f"I-{role}"

                            example = EAEInputExample(
                                example_id=item["id"],
                                text=words,
                                pred_type=pred_type,
                                true_type=event["type"],
                                trigger_left=trigger_left,
                                trigger_right=trigger_right,
                                labels=labels,
                            )
                            self.examples.append(example)

                    # negative triggers
                    for neg in item["negative_triggers"]:
                        # The prediction is consumed even when the trigger is skipped, to keep the counter
                        # aligned with the predictions.
                        pred_type = self.get_single_pred(trigger_idx, input_file, true_type="NA")
                        trigger_idx += 1
                        if self.config.eae_eval_mode == "loose":
                            continue
                        elif self.config.eae_eval_mode in ["default", "strict"]:
                            if pred_type != "NA":
                                neg_left, neg_right = get_left_and_right_pos(text, neg, language, True)
                                example = EAEInputExample(
                                    example_id=item["id"],
                                    text=words,
                                    pred_type=pred_type,
                                    true_type="NA",
                                    trigger_left=neg_left,
                                    trigger_right=neg_right,
                                    labels=["O"] * len(words),
                                )
                                self.examples.append(example)
                        else:
                            raise ValueError("Invalid eae_eval_mode: %s" % self.config.eae_eval_mode)
                else:
                    for can in item["candidates"]:
                        can_left, can_right = get_left_and_right_pos(text, can, language, True)
                        labels = ["O"] * len(words)
                        pred_type = self.event_preds[trigger_idx]
                        trigger_idx += 1
                        if pred_type != "NA":
                            example = EAEInputExample(
                                example_id=item["id"],
                                text=words,
                                pred_type=pred_type,
                                true_type="NA",  # true type not given, set to NA.
                                trigger_left=can_left,
                                trigger_right=can_right,
                                labels=labels,
                            )
                            self.examples.append(example)
                            # Index of the candidate that was just consumed.
                            self.positive_candidate_indices.append(trigger_idx - 1)
            if self.event_preds is not None:
                assert trigger_idx == len(self.event_preds), (
                    "Visited %d triggers in %s but %d event predictions are available"
                    % (trigger_idx, input_file, len(self.event_preds))
                )

    def get_final_labels(self,
                         labels: List[str],
                         word_ids_of_each_token: List[Any],
                         label_all_tokens: bool = False) -> List[Union[str, int]]:
        """Obtains the final label of each token.

        Args:
            labels (`List[str]`):
                Word-level role labels, indexed by the word ids in `word_ids_of_each_token`.
            word_ids_of_each_token (`List[Any]`):
                For every token, the index of the word it belongs to, or `None` for tokens that belong to no
                word.
            label_all_tokens (`bool`, defaults to `False`):
                If `True`, every sub-word token receives the label id of its word; otherwise only the first
                token of each word does and the remaining ones receive `-100`.

        Returns:
            A list with one entry per token: `-100` for tokens without a word (and, unless `label_all_tokens`
            is set, for non-initial sub-word tokens), otherwise `config.role2id` of the word's label.
        """
        final_labels = []
        pre_word_id = None
        for word_id in word_ids_of_each_token:
            if word_id is None:
                final_labels.append(-100)
            elif word_id != pre_word_id or label_all_tokens:
                # First token of a word, or a continuation token when all tokens are labelled.
                final_labels.append(self.config.role2id[labels[word_id]])
            else:
                final_labels.append(-100)
            pre_word_id = word_id

        return final_labels

    @staticmethod
    def insert_marker(text: list,
                      event_type: str,
                      labels,
                      trigger_pos: List[int],
                      markers):
        """Adds a marker at the start and end position of the event trigger.

        Args:
            text (`list`):
                The words of the sentence.
            event_type (`str`):
                Key used to look up the marker pair in `markers`.
            labels:
                Word-level labels, one per element of `text`.
            trigger_pos (`List[int]`):
                `[left, right]` word positions of the trigger; `right` is exclusive.
            markers:
                Mapping from event type to a pair `(opening marker, closing marker)`.

        Returns:
            A tuple `(marked_text, marked_labels)` of equal length: the words with the two markers inserted
            around the trigger, and the labels with `"X"` inserted at the two marker positions.
        """
        left, right = trigger_pos
        opening, closing = markers[event_type][0], markers[event_type][1]

        marked_text = text[:left] + [opening] + text[left:right] + [closing] + text[right:]
        marked_labels = labels[:left] + ["X"] + labels[left:right] + ["X"] + labels[right:]

        assert len(marked_text) == len(marked_labels)
        return marked_text, marked_labels

    def convert_examples_to_features(self) -> None:
        """Converts the `EAEInputExample`s into `EAEInputFeatures`s.

        Rebuilds both `self.input_features` and `self.is_overflow` from `self.examples`. Each example is first
        augmented with the trigger markers of its predicted event type, then tokenized, truncated and labelled.
        """
        self.input_features = []
        self.is_overflow = []

        for example in tqdm(self.examples, desc="Processing features for SL"):
            text, labels = self.insert_marker(example.text,
                                              example.pred_type,
                                              example.labels,
                                              [example.trigger_left, example.trigger_right],
                                              self.config.markers)
            # The tokenizer only pads; cutting to `max_seq_length` is done by `_truncate()` below so that
            # the overflow information can be recorded per example.
            outputs = self.tokenizer(text,
                                     padding="max_length",
                                     truncation=False,
                                     max_length=self.config.max_seq_length,
                                     is_split_into_words=True)
            # Roberta tokenizer doesn't return token_type_ids
            if "token_type_ids" not in outputs:
                outputs["token_type_ids"] = [0] * len(outputs["input_ids"])
            outputs, is_overflow = self._truncate(outputs, self.config.max_seq_length)
            self.is_overflow.append(is_overflow)

            # Use the marker-augmented word list: it is what the tokenizer was run on and what `labels` is
            # aligned with, so the resulting word ids must refer to it rather than to `example.text`.
            word_ids_of_each_token = get_word_ids(self.tokenizer, outputs, text)[: self.config.max_seq_length]
            final_labels = self.get_final_labels(labels, word_ids_of_each_token, label_all_tokens=False)

            features = EAEInputFeatures(
                example_id=example.example_id,
                input_ids=outputs["input_ids"],
                attention_mask=outputs["attention_mask"],
                token_type_ids=outputs["token_type_ids"],
                labels=final_labels,
            )
            self.input_features.append(features)