Base Processor
import json
import logging
import os
from typing import Any, Dict, List, Optional, Union

import torch
from torch.utils.data import Dataset
logger = logging.getLogger(__name__)
EDInputExample
A single training/test example for event detection, representing the basic information of an event trigger, including its example id, the source text it is within, its start and end position, and the event type of the trigger.
Attributes:
- example_id: A string or an integer for the unique id of the example.
- text: A string representing the source text the event trigger is within.
- trigger_left: An integer indicating the left position of the event trigger.
- trigger_right: An integer indicating the right position of the event trigger.
- labels: A string indicating the event type of the trigger.
class EDInputExample(object):
    """A single training/test example for event detection.

    Holds the raw information about one event trigger: the id of the example, the source text the trigger is
    within, the left and right position of the trigger, and the event type used as its label.

    Attributes:
        example_id (`Union[int, str]`):
            A string or an integer for the unique id of the example.
        text (`str`):
            A string representing the source text the event trigger is within.
        trigger_left (`int`, `optional`, defaults to `None`):
            An integer indicating the left position of the event trigger.
        trigger_right (`int`, `optional`, defaults to `None`):
            An integer indicating the right position of the event trigger.
        labels (`str`, `optional`, defaults to `None`):
            A string indicating the event type of the trigger.
    """

    def __init__(self,
                 example_id: Union[int, str],
                 text: str,
                 trigger_left: Optional[int] = None,
                 trigger_right: Optional[int] = None,
                 labels: Optional[str] = None) -> None:
        """Constructs an `EDInputExample`.

        Every argument is stored unchanged on the instance under the attribute of the same name.
        """
        self.example_id = example_id
        self.text = text
        self.trigger_left = trigger_left
        self.trigger_right = trigger_right
        self.labels = labels
EDInputFeatures
Input features of an instance for event detection, representing the basic features of an event trigger, including its example id, the indices of tokens in the vocabulary, attention masks, segment token indices, start and end position, and the event type of the trigger.
Attributes:
- example_id: A string or an integer for the unique id of the example.
- input_ids: A list of integers representing the indices of input sequence tokens in the vocabulary.
- attention_mask: A list of integers (in 0/1) for masks to avoid attention on padding tokens.
- token_type_ids: A list of integers indicating the first and second portions of the inputs.
- trigger_left: An integer indicating the left position of the event trigger.
- trigger_right: An integer indicating the right position of the event trigger.
- labels: A string indicating the event type of the trigger.
class EDInputFeatures(object):
    """Input features of an instance for event detection.

    Holds the model-ready features of one event trigger: its example id, the indices of tokens in the vocabulary,
    the attention mask, the segment token indices, the left and right position of the trigger, and its label.

    Attributes:
        example_id (`Union[int, str]`):
            A string or an integer for the unique id of the example.
        input_ids (`List[int]`):
            A list of integers representing the indices of input sequence tokens in the vocabulary.
        attention_mask (`List[int]`):
            A list of integers (in 0/1) for masks to avoid attention on padding tokens.
        token_type_ids (`List[int]`, `optional`, defaults to `None`):
            A list of integers indicating the first and second portions of the inputs.
        trigger_left (`int`, `optional`, defaults to `None`):
            An integer indicating the left position of the event trigger.
        trigger_right (`int`, `optional`, defaults to `None`):
            An integer indicating the right position of the event trigger.
        labels (`Union[int, List[int]]`, `optional`, defaults to `None`):
            The encoded label(s) of the instance. `EDDataProcessor.__getitem__` converts this value with
            `torch.tensor(..., dtype=torch.long)`, so it has to be an integer or a list of integers rather than
            the raw event-type string.
    """

    def __init__(self,
                 example_id: Union[int, str],
                 input_ids: List[int],
                 attention_mask: List[int],
                 token_type_ids: Optional[List[int]] = None,
                 trigger_left: Optional[int] = None,
                 trigger_right: Optional[int] = None,
                 labels: Optional[Union[int, List[int]]] = None) -> None:
        """Constructs an `EDInputFeatures`.

        Every argument is stored unchanged on the instance under the attribute of the same name.
        """
        self.example_id = example_id
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.trigger_left = trigger_left
        self.trigger_right = trigger_right
        self.labels = labels
EAEInputExample
A single training/test example for event argument extraction, representing the basic information of an event trigger, including its example id, the source text it is within, the predicted and actual event type, the input template for the Machine Reading Comprehension (MRC) paradigm, the start and end position of the event trigger and argument, and the label of the event.
Attributes:
- example_id: A string or an integer for the unique id of the example.
- text: A string representing the source text the event trigger and argument are within.
- pred_type: A string indicating the event type predicted by the model.
- true_type: A string indicating the actual event type from the annotation.
- input_template: The input template for the MRC paradigm.
- trigger_left: An integer indicating the left position of the event trigger.
- trigger_right: An integer indicating the right position of the event trigger.
- argument_left: An integer indicating the left position of the argument mention.
- argument_right: An integer indicating the right position of the argument mention.
- argument_role: A string indicating the argument role of the argument mention.
- labels: A string indicating the label of the event.
class EAEInputExample(object):
    """A single training/test example for event argument extraction.

    Holds the raw information about one candidate argument of an event: the example id, the source text, the
    predicted and annotated event type, the input template for the Machine Reading Comprehension (MRC) paradigm,
    the left and right position of the event trigger and of the argument mention, the argument role, and the label.

    Attributes:
        example_id (`Union[int, str]`):
            A string or an integer for the unique id of the example.
        text (`str`):
            A string representing the source text the event trigger and argument are within.
        pred_type (`str`):
            A string indicating the event type predicted by the model.
        true_type (`str`):
            A string indicating the actual event type from the annotation.
        input_template (`optional`, defaults to `None`):
            The input template for the MRC paradigm. Its concrete type is not fixed by this class.
        trigger_left (`int`, `optional`, defaults to `None`):
            An integer indicating the left position of the event trigger.
        trigger_right (`int`, `optional`, defaults to `None`):
            An integer indicating the right position of the event trigger.
        argument_left (`int`, `optional`, defaults to `None`):
            An integer indicating the left position of the argument mention.
        argument_right (`int`, `optional`, defaults to `None`):
            An integer indicating the right position of the argument mention.
        argument_role (`str`, `optional`, defaults to `None`):
            A string indicating the argument role of the argument mention.
        labels (`str`, `optional`, defaults to `None`):
            A string indicating the label of the event.
    """

    def __init__(self,
                 example_id: Union[int, str],
                 text: str,
                 pred_type: str,
                 true_type: str,
                 input_template: Optional[Any] = None,
                 trigger_left: Optional[int] = None,
                 trigger_right: Optional[int] = None,
                 argument_left: Optional[int] = None,
                 argument_right: Optional[int] = None,
                 argument_role: Optional[str] = None,
                 labels: Optional[str] = None) -> None:
        """Constructs an `EAEInputExample`.

        Every argument is stored unchanged on the instance under the attribute of the same name.
        """
        self.example_id = example_id
        self.text = text
        self.pred_type = pred_type
        self.true_type = true_type
        self.input_template = input_template
        self.trigger_left = trigger_left
        self.trigger_right = trigger_right
        self.argument_left = argument_left
        self.argument_right = argument_right
        self.argument_role = argument_role
        self.labels = labels
EAEInputFeatures
Input features of an instance for event argument extraction, representing the basic features of an argument mention, including its example id, the indices of tokens in the vocabulary, the attention mask, segment token indices, the start and end position of the event trigger and argument mention, and the event type of the trigger.
Attributes:
- example_id: A string or an integer for the unique id of the example.
- input_ids: A list of integers representing the indices of input sequence tokens in the vocabulary.
- attention_mask: A list of integers (in 0/1) for masks to avoid attention on padding tokens.
- token_type_ids: A list of integers indicating the first and second portions of the inputs.
- trigger_left: An integer for the left position of the event trigger.
- trigger_right: An integer for the right position of the event trigger.
- argument_left: An integer for the left position of the argument mention.
- argument_right: An integer for the right position of the argument mention.
- labels: A string indicating the event type of the trigger.
class EAEInputFeatures(object):
    """Input features of an instance for event argument extraction.

    Holds the model-ready features of one argument mention: its example id, the indices of tokens in the
    vocabulary, the attention mask, the segment token indices, the left and right position of the event trigger
    and of the argument mention, and its label.

    Attributes:
        example_id (`Union[int, str]`):
            A string or an integer for the unique id of the example.
        input_ids (`List[int]`):
            A list of integers representing the indices of input sequence tokens in the vocabulary.
        attention_mask (`List[int]`):
            A list of integers (in 0/1) for masks to avoid attention on padding tokens.
        token_type_ids (`List[int]`, `optional`, defaults to `None`):
            A list of integers indicating the first and second portions of the inputs.
        trigger_left (`int`, `optional`, defaults to `None`):
            An integer for the left position of the event trigger.
        trigger_right (`int`, `optional`, defaults to `None`):
            An integer for the right position of the event trigger.
        argument_left (`int`, `optional`, defaults to `None`):
            An integer for the left position of the argument mention.
        argument_right (`int`, `optional`, defaults to `None`):
            An integer for the right position of the argument mention.
        labels (`Union[int, List[int]]`, `optional`, defaults to `None`):
            The encoded label(s) of the instance. `EAEDataProcessor.__getitem__` converts this value with
            `torch.tensor(..., dtype=torch.long)`, so it has to be an integer or a list of integers rather than
            a raw label string.
    """

    def __init__(self,
                 example_id: Union[int, str],
                 input_ids: List[int],
                 attention_mask: List[int],
                 token_type_ids: Optional[List[int]] = None,
                 trigger_left: Optional[int] = None,
                 trigger_right: Optional[int] = None,
                 argument_left: Optional[int] = None,
                 argument_right: Optional[int] = None,
                 labels: Optional[Union[int, List[int]]] = None) -> None:
        """Constructs an `EAEInputFeatures`.

        Every argument is stored unchanged on the instance under the attribute of the same name.
        """
        self.example_id = example_id
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.trigger_left = trigger_left
        self.trigger_right = trigger_right
        self.argument_left = argument_left
        self.argument_right = argument_right
        self.labels = labels
EDDataProcessor
The base class of data processor for event detection, which would be inherited to construct task-specific data processors.
Attributes:
- config: The pre-defined configurations of the execution.
- tokenizer: The tokenizer method proposed for the tokenization process.
- examples: A list of ``EDInputExample``s constructed based on the input dataset.
- input_features: A list of ``EDInputFeatures``s corresponding to the ``EDInputExample``s.
class EDDataProcessor(Dataset):
    """Base class of data processor for event detection.

    Task-specific processors inherit from this class and implement `read_examples` and
    `convert_examples_to_features`; the base class supplies truncation, indexing and batch collation.

    Attributes:
        config:
            The pre-defined configurations of the execution. This class reads `return_token_type_ids`,
            `truncate_in_batch` and `truncate_seq2seq_output` from it.
        tokenizer:
            The tokenizer proposed for the tokenization process.
        examples (`List[EDInputExample]`):
            A list of `EDInputExample`s constructed based on the input dataset.
        input_features (`List[EDInputFeatures]`):
            A list of `EDInputFeatures`s corresponding to the `EDInputExample`s.
    """

    def __init__(self,
                 config,
                 tokenizer) -> None:
        """Constructs an `EDDataProcessor` with empty example and feature lists."""
        self.config, self.tokenizer = config, tokenizer
        self.examples = []
        self.input_features = []

    def read_examples(self,
                      input_file: str):
        """Obtains a collection of `EDInputExample`s for the dataset; subclasses must implement it."""
        raise NotImplementedError

    def convert_examples_to_features(self):
        """Converts the `EDInputExample`s into `EDInputFeatures`s; subclasses must implement it."""
        raise NotImplementedError

    def _truncate(self,
                  outputs: dict,
                  max_seq_length: int):
        """Cuts the per-token entries of `outputs` down to `max_seq_length` when `input_ids` is longer.

        Args:
            outputs (`dict`):
                The tokenizer outputs; must contain `"input_ids"`. It is modified in place.
            max_seq_length (`int`):
                The maximum number of positions to keep.

        Returns:
            A tuple of the (possibly truncated) `outputs` and a boolean telling whether truncation happened.
        """
        # Nothing to do when the instance already fits.
        if len(outputs["input_ids"]) <= max_seq_length:
            return outputs, False

        print("An instance exceeds the maximum length.")
        for name in ("input_ids", "attention_mask", "token_type_ids", "offset_mapping"):
            if name in outputs:
                outputs[name] = outputs[name][:max_seq_length]
        return outputs, True

    def get_ids(self) -> List[Union[int, str]]:
        """Returns the ids of all examples, in order."""
        return [sample.example_id for sample in self.examples]

    def __len__(self) -> int:
        """Returns the number of converted input features."""
        return len(self.input_features)

    def __getitem__(self,
                    index: int) -> Dict[str, torch.Tensor]:
        """Builds the tensor dictionary for the features stored at `index`.

        `input_ids` and `attention_mask` are always present; `token_type_ids`, `trigger_left`, `trigger_right`
        and `labels` are added only when the corresponding feature is not `None` (and, for `token_type_ids`,
        when `config.return_token_type_ids` is truthy).
        """
        feature = self.input_features[index]
        item = {
            "input_ids": torch.tensor(feature.input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(feature.attention_mask, dtype=torch.float32),
        }
        if feature.token_type_ids is not None and self.config.return_token_type_ids:
            item["token_type_ids"] = torch.tensor(feature.token_type_ids, dtype=torch.long)
        # NOTE(review): trigger positions are emitted as float32 here - confirm the consumers expect that dtype.
        for name in ("trigger_left", "trigger_right"):
            position = getattr(feature, name)
            if position is not None:
                item[name] = torch.tensor(position, dtype=torch.float32)
        if feature.labels is not None:
            item["labels"] = torch.tensor(feature.labels, dtype=torch.long)
        return item

    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
        """Stacks the samples of a batch and optionally trims them to the longest sequence in the batch.

        With `config.truncate_in_batch` set, `input_ids`, `attention_mask` and `token_type_ids` are cut to the
        largest attention-mask sum found in the batch (presumably padding is trailing - verify against the
        tokenizer settings). Two-dimensional `labels` are cut either to the largest count of entries different
        from -100 (`config.truncate_seq2seq_output`) or to that same input length.
        """
        collated = {key: torch.stack([sample[key] for sample in batch], dim=0) for key in batch[0].keys()}
        if not self.config.truncate_in_batch:
            return collated

        input_length = int(collated["attention_mask"].sum(-1).max())
        for key in ("input_ids", "attention_mask", "token_type_ids"):
            if key in collated:
                collated[key] = collated[key][:, :input_length]

        # Only sequence-shaped labels are trimmed; per-instance scalar labels stay as they are.
        if "labels" in collated and len(collated["labels"].shape) == 2:
            labels = collated["labels"]
            if self.config.truncate_seq2seq_output:
                output_length = int((labels != -100).sum(-1).max())
                collated["labels"] = labels[:, :output_length]
            else:
                collated["labels"] = labels[:, :input_length]
        return collated
EAEDataProcessor
The base class of data processor for event argument extraction, which would be inherited to construct task-specific data processors.
Attributes:
- config: The pre-defined configurations of the execution.
- tokenizer: The tokenizer method proposed for the tokenization process.
- is_training: A boolean variable indicating whether the state is training or not.
- examples: A list of ``EAEInputExample``s constructed based on the input dataset.
- input_features: A list of ``EAEInputFeatures``s corresponding to the ``EAEInputExample``s.
- data_for_evaluation: A dictionary representing the evaluation data.
- event_preds: A list of event prediction data if the file exists.
class EAEDataProcessor(Dataset):
    """Base class of data processor for event argument extraction.

    Task-specific processors inherit from this class and implement `read_examples` and
    `convert_examples_to_features`; the base class supplies loading of the event prediction file, collection of
    the data needed for evaluation, truncation, indexing and batch collation.

    Attributes:
        config:
            The pre-defined configurations of the execution. This class reads `return_token_type_ids`,
            `truncate_in_batch` and `truncate_seq2seq_output` from it and, when it has a `role2id` mapping,
            adds the entry `"X" -> -100` to that mapping.
        tokenizer:
            The tokenizer proposed for the tokenization process.
        is_training (`bool`):
            A boolean variable indicating whether the state is training or not.
        examples (`List[EAEInputExample]`):
            A list of `EAEInputExample`s constructed based on the input dataset.
        input_features (`List[EAEInputFeatures]`):
            A list of `EAEInputFeatures`s corresponding to the `EAEInputExample`s.
        data_for_evaluation (`dict`):
            A dictionary representing the evaluation data.
        event_preds:
            The content parsed from the event prediction file, or `None` when no usable file was given.
    """

    def __init__(self,
                 config,
                 tokenizer,
                 pred_file: str,
                 is_training: bool) -> None:
        """Constructs an `EAEDataProcessor`.

        Args:
            config:
                The pre-defined configurations of the execution.
            tokenizer:
                The tokenizer proposed for the tokenization process.
            pred_file (`str`):
                Path of a JSON file with event predictions. When it is `None` or does not exist, a warning is
                logged and `event_preds` is set to `None`.
            is_training (`bool`):
                Whether the processor is used for training.
        """
        self.config = config
        self.tokenizer = tokenizer
        self.is_training = is_training
        if hasattr(config, "role2id"):
            # Register the extra role "X" with id -100 (the same value `collate_fn` treats as filler in labels).
            # NOTE: this mutates the mapping on the config object that was passed in.
            self.config.role2id["X"] = -100
        self.examples = []
        self.input_features = []
        # data for trainer evaluation
        self.data_for_evaluation = {}
        # event predictions; stays `None` unless a readable prediction file is supplied
        self.event_preds = None
        if pred_file is None:
            logger.warning("Event predictions is none! We use golden triggers.")
        elif not os.path.exists(pred_file):
            logger.warning("%s doesn't exist.We use golden triggers", pred_file)
        else:
            # The context manager closes the file handle even if parsing fails.
            with open(pred_file) as pred_handle:
                self.event_preds = json.load(pred_handle)

    def read_examples(self,
                      input_file: str):
        """Obtains a collection of `EAEInputExample`s for the dataset; subclasses must implement it."""
        raise NotImplementedError

    def convert_examples_to_features(self):
        """Converts the `EAEInputExample`s into `EAEInputFeatures`s; subclasses must implement it."""
        raise NotImplementedError

    def get_data_for_evaluation(self) -> Dict[str, List[Union[int, str]]]:
        """Obtains the data for evaluation.

        Fills `data_for_evaluation` with the predicted types, the true types and the ids of all examples. The
        roles are added as well when the first example carries an argument role; with no examples at all the
        roles are simply left out.

        Returns:
            The `data_for_evaluation` dictionary itself (not a copy).
        """
        self.data_for_evaluation["pred_types"] = self.get_pred_types()
        self.data_for_evaluation["true_types"] = self.get_true_types()
        self.data_for_evaluation["ids"] = self.get_ids()
        # Guard against an empty example list before peeking at the first example.
        if self.examples and self.examples[0].argument_role is not None:
            self.data_for_evaluation["roles"] = self.get_roles()
        return self.data_for_evaluation

    def get_pred_types(self) -> List[str]:
        """Obtains the event type predicted by the model for every example."""
        return [example.pred_type for example in self.examples]

    def get_true_types(self) -> List[str]:
        """Obtains the actual event type from the annotation for every example."""
        return [example.true_type for example in self.examples]

    def get_roles(self) -> List[str]:
        """Obtains the role of each argument mention."""
        return [example.argument_role for example in self.examples]

    def _truncate(self,
                  outputs: Dict[str, List[int]],
                  max_seq_length: int):
        """Truncates the sequence that exceeds the maximum length.

        Args:
            outputs (`Dict[str, List[int]]`):
                The tokenizer outputs; must contain `"input_ids"`. It is modified in place.
            max_seq_length (`int`):
                The maximum number of positions to keep.

        Returns:
            A tuple of the (possibly truncated) `outputs` and a boolean telling whether truncation happened.
        """
        is_truncation = False
        if len(outputs["input_ids"]) > max_seq_length:
            print("An instance exceeds the maximum length.")
            is_truncation = True
            for key in ("input_ids", "attention_mask", "token_type_ids", "offset_mapping"):
                if key in outputs:
                    outputs[key] = outputs[key][:max_seq_length]
        return outputs, is_truncation

    def get_ids(self) -> List[Union[int, str]]:
        """Returns the ids of all examples, in order."""
        return [example.example_id for example in self.examples]

    def __len__(self) -> int:
        """Returns the number of converted input features."""
        return len(self.input_features)

    def __getitem__(self,
                    index: int) -> Dict[str, torch.Tensor]:
        """Returns the features of a given example index in a dictionary.

        `input_ids` and `attention_mask` are always present; `token_type_ids`, the trigger and argument
        positions and `labels` are added only when the corresponding feature is not `None` (and, for
        `token_type_ids`, when `config.return_token_type_ids` is truthy).
        """
        features = self.input_features[index]
        data_dict = dict(
            input_ids=torch.tensor(features.input_ids, dtype=torch.long),
            attention_mask=torch.tensor(features.attention_mask, dtype=torch.float32)
        )
        if features.token_type_ids is not None and self.config.return_token_type_ids:
            data_dict["token_type_ids"] = torch.tensor(features.token_type_ids, dtype=torch.long)
        # All span boundaries share the same conversion, so handle them uniformly and in a fixed order.
        for name in ("trigger_left", "trigger_right", "argument_left", "argument_right"):
            position = getattr(features, name)
            if position is not None:
                data_dict[name] = torch.tensor(position, dtype=torch.long)
        if features.labels is not None:
            data_dict["labels"] = torch.tensor(features.labels, dtype=torch.long)
        return data_dict

    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
        """Collates the samples in batches.

        With `config.truncate_in_batch` set, `input_ids`, `attention_mask` and `token_type_ids` are cut to the
        largest attention-mask sum found in the batch (presumably padding is trailing - verify against the
        tokenizer settings). Two-dimensional `labels` are cut either to the largest count of entries different
        from -100 (`config.truncate_seq2seq_output`) or to that same input length.
        """
        output_batch = dict()
        for key in batch[0].keys():
            output_batch[key] = torch.stack([x[key] for x in batch], dim=0)
        if self.config.truncate_in_batch:
            input_length = int(output_batch["attention_mask"].sum(-1).max())
            for key in ("input_ids", "attention_mask", "token_type_ids"):
                if key in output_batch:
                    output_batch[key] = output_batch[key][:, :input_length]
            # Only sequence-shaped labels are trimmed; per-instance scalar labels stay as they are.
            if "labels" in output_batch and len(output_batch["labels"].shape) == 2:
                if self.config.truncate_seq2seq_output:
                    output_length = int((output_batch["labels"] != -100).sum(-1).max())
                    output_batch["labels"] = output_batch["labels"][:, :output_length]
                else:
                    output_batch["labels"] = output_batch["labels"][:, :input_length]
        return output_batch