Token Classification Processor
import json
import logging
from tqdm import tqdm
from typing import List, Optional, Dict
from .base_processor import (
EDDataProcessor,
EDInputExample,
EDInputFeatures,
EAEDataProcessor,
EAEInputExample,
EAEInputFeatures
)
# Module-level logger, named after this module; used below to warn when markers are missing after tokenization.
logger = logging.getLogger(__name__)
EDTCProcessor
Data processor for token classification for event detection. The class inherits from the `EDDataProcessor` class and
implements the functions that the parent class leaves undefined, namely `read_examples()` and
`convert_examples_to_features()`; the rest of the attributes and functions are inherited unchanged from the
`EDDataProcessor` class.
class EDTCProcessor(EDDataProcessor):
    """Data processor for token classification for event detection.

    The class inherits from the `EDDataProcessor` class and implements `read_examples()` and
    `convert_examples_to_features()`; the rest of the attributes and functions come from `EDDataProcessor`.

    Every trigger candidate becomes one example. The candidate span is wrapped in the two marker strings
    `config.markers[0]` / `config.markers[1]`, the marked text is tokenized, and the token positions of the two
    markers are stored as the trigger position.

    Attributes:
        examples: The `EDInputExample`s read by `read_examples()`.
        input_features: The `EDInputFeatures` built by `convert_examples_to_features()`.
    """

    def __init__(self,
                 config,
                 tokenizer: str,
                 input_file: str) -> None:
        """Constructs an EDTCProcessor.

        Args:
            config: Configuration object. This class reads `language`, `markers`, `max_seq_length` and `type2id`
                from it (through `self.config`, presumably set by the parent constructor).
            tokenizer: Forwarded to the parent constructor.
                NOTE(review): annotated as `str`, but `self.tokenizer` is later called like a tokenizer object
                (`self.tokenizer(text, ...)`, `convert_tokens_to_ids`) -- confirm the intended type.
            input_file: Path of the JSON-lines file to read.
        """
        super().__init__(config, tokenizer)
        self.read_examples(input_file)
        self.convert_examples_to_features()

    def read_examples(self,
                      input_file: str) -> None:
        """Obtains a collection of `EDInputExample`s for the dataset.

        Each non-empty line of `input_file` is parsed as one JSON object with a `"text"` field. Triggers listed
        under `"events"` are labeled with their event type; entries under `"negative_triggers"` and `"candidates"`
        are labeled `"NA"`. The result is stored in `self.examples`.

        Args:
            input_file: Path of the JSON-lines file to read.
        """
        self.examples = []
        # Read as UTF-8 explicitly so that parsing does not depend on the platform's default encoding.
        with open(input_file, "r", encoding="utf-8") as f:
            # `readlines()` is kept so that tqdm knows the total and can show progress.
            all_lines = f.readlines()
        for line in tqdm(all_lines, desc="Reading from %s" % input_file):
            line = line.strip()
            if not line:
                # A blank line carries no record; `json.loads("")` would raise.
                continue
            item = json.loads(line)
            # training and valid set
            if "events" in item:
                for event in item["events"]:
                    for trigger in event["triggers"]:
                        self.examples.append(EDInputExample(
                            example_id=trigger["id"],
                            text=item["text"],
                            trigger_left=trigger["position"][0],
                            trigger_right=trigger["position"][1],
                            labels=event["type"]
                        ))
            if "negative_triggers" in item:
                for neg in item["negative_triggers"]:
                    self.examples.append(EDInputExample(
                        example_id=neg["id"],
                        text=item["text"],
                        trigger_left=neg["position"][0],
                        trigger_right=neg["position"][1],
                        labels="NA"
                    ))
            # test set: candidates are always read with the placeholder label "NA"
            if "candidates" in item:
                for candidate in item["candidates"]:
                    self.examples.append(EDInputExample(
                        example_id=candidate["id"],
                        text=item["text"],
                        trigger_left=candidate["position"][0],
                        trigger_right=candidate["position"][1],
                        labels="NA",
                    ))

    def convert_examples_to_features(self) -> None:
        """Converts the `EDInputExample`s into `EDInputFeatures`s.

        The trigger span of every example is surrounded by the two markers (with extra spaces unless the language
        is Chinese), the marked text is tokenized to `config.max_seq_length`, and the token indices of the markers
        become `trigger_left` / `trigger_right`. If a marker is missing after tokenization, both positions fall
        back to 0 and a warning is logged. The result is stored in `self.input_features`.
        """
        self.input_features = []
        left_marker = self.config.markers[0]
        right_marker = self.config.markers[1]
        # The marker ids are the same for every example, so look them up once.
        left_marker_id = self.tokenizer.convert_tokens_to_ids(left_marker)
        right_marker_id = self.tokenizer.convert_tokens_to_ids(right_marker)
        # Chinese text is not whitespace-delimited, so the markers are attached without padding spaces.
        pad = "" if self.config.language == "Chinese" else " "

        # merge and then tokenize
        for example in tqdm(self.examples, desc="Processing features for TC"):
            text_left = example.text[:example.trigger_left]
            text_mid = example.text[example.trigger_left:example.trigger_right]
            text_right = example.text[example.trigger_right:]
            text = text_left + left_marker + pad + text_mid + pad + right_marker + text_right

            outputs = self.tokenizer(text,
                                     padding="max_length",
                                     truncation=True,
                                     max_length=self.config.max_seq_length)
            try:
                left = outputs["input_ids"].index(left_marker_id)
                right = outputs["input_ids"].index(right_marker_id)
            except ValueError:
                # `list.index` raises ValueError when a marker id is absent from the token ids.
                logger.warning("Markers are not in the input tokens.")
                left, right = 0, 0

            # Roberta tokenizer doesn't return token_type_ids
            if "token_type_ids" not in outputs:
                outputs["token_type_ids"] = [0] * len(outputs["input_ids"])

            features = EDInputFeatures(
                example_id=example.example_id,
                input_ids=outputs["input_ids"],
                attention_mask=outputs["attention_mask"],
                token_type_ids=outputs["token_type_ids"],
                trigger_left=left,
                trigger_right=right
            )
            if example.labels is not None:
                features.labels = self.config.type2id[example.labels]
            self.input_features.append(features)
EAETCProcessor
Data processor for token classification for event argument extraction. The class inherits from the
`EAEDataProcessor` class and implements the functions that the parent class leaves undefined, namely
`read_examples()` and `convert_examples_to_features()`; a new function named `insert_marker()` is defined, and
the rest of the attributes and functions are inherited unchanged from the `EAEDataProcessor` class.
class EAETCProcessor(EAEDataProcessor):
    """Data processor for token classification for event argument extraction.

    The class inherits from the `EAEDataProcessor` class and implements `read_examples()` and
    `convert_examples_to_features()`; a new function named `insert_marker()` is defined, and the rest of the
    attributes and functions come from `EAEDataProcessor`.

    Every (trigger, argument candidate) pair becomes one example. Both spans are wrapped in marker strings taken
    from `config.markers`, the marked text is tokenized, and the token positions of the opening/closing markers
    are stored as the trigger and argument positions.

    Attributes:
        examples: The `EAEInputExample`s read by `read_examples()`.
        input_features: The `EAEInputFeatures` built by `convert_examples_to_features()`.
    """

    def __init__(self,
                 config,
                 tokenizer: str,
                 input_file: str,
                 pred_file: str,
                 is_training: Optional[bool] = False) -> None:
        """Constructs a `EAETCProcessor`.

        Args:
            config: Configuration object. This class reads `golden_trigger`, `eae_eval_mode`, `language`,
                `markers`, `max_seq_length` and `role2id` from it (through `self.config`, presumably set by the
                parent constructor).
            tokenizer: Forwarded to the parent constructor.
                NOTE(review): annotated as `str`, but `self.tokenizer` is later called like a tokenizer object
                -- confirm the intended type.
            input_file: Path of the JSON-lines file to read.
            pred_file: Forwarded to the parent constructor; presumably the source of `self.event_preds`.
            is_training: Forwarded to the parent constructor; read back here as `self.is_training`.
        """
        super().__init__(config, tokenizer, pred_file, is_training)
        self.read_examples(input_file)
        self.convert_examples_to_features()

    @staticmethod
    def _make_example(example_id,
                      text: str,
                      pred_type: str,
                      true_type: str,
                      trigger_position: List[int],
                      argument_position: List[int],
                      labels: str):
        """Builds one `EAEInputExample` from a trigger span and an argument span."""
        return EAEInputExample(
            example_id=example_id,
            text=text,
            pred_type=pred_type,
            true_type=true_type,
            trigger_left=trigger_position[0],
            trigger_right=trigger_position[1],
            argument_left=argument_position[0],
            argument_right=argument_position[1],
            labels=labels
        )

    def _read_trigger_arguments(self,
                                item: dict,
                                trigger: dict,
                                pred_type: str,
                                true_type: str,
                                force_gold_type: bool) -> None:
        """Appends the positive and the negative argument examples of one annotated trigger."""
        args_for_trigger = set()
        positive_offsets = []
        # positive arguments: every annotated mention, labeled with its role
        for argument in trigger["arguments"]:
            for mention in argument["mentions"]:
                example = self._make_example(trigger["id"], item["text"], pred_type, true_type,
                                             trigger["position"], mention["position"], argument["role"])
                args_for_trigger.add(mention['mention_id'])
                positive_offsets.append(mention["position"])
                self.examples.append(example)

        if "entities" in item:
            for entity in item["entities"]:
                # An entity with at least one mention used as an argument of this trigger is not a negative.
                if any(mention["mention_id"] in args_for_trigger for mention in entity["mentions"]):
                    continue
                # negative arguments
                for mention in entity["mentions"]:
                    example = self._make_example(trigger["id"], item["text"], pred_type, true_type,
                                                 trigger["position"], mention["position"], "NA")
                    if force_gold_type:
                        example.pred_type = true_type
                    self.examples.append(example)
        else:
            # Without entity annotations, the negative triggers serve as negative argument candidates.
            for neg in item["negative_triggers"]:
                neg_left, neg_right = neg["position"][0], neg["position"][1]
                # Two half-open spans share a position iff max(starts) < min(ends); skip candidates that
                # overlap any positive argument mention.
                if any(max(pos[0], neg_left) < min(pos[1], neg_right) for pos in positive_offsets):
                    continue
                example = self._make_example(trigger["id"], item["text"], pred_type, true_type,
                                             trigger["position"], neg["position"], "NA")
                if force_gold_type:
                    example.pred_type = true_type
                self.examples.append(example)

    def _read_unlabeled_trigger(self,
                                item: dict,
                                trigger_idx: int,
                                trigger_position: List[int],
                                pred_type: str) -> None:
        """Appends all-"NA" argument examples for a trigger that has no gold event (negative trigger / candidate)."""
        if "entities" in item:
            candidates = [mention for entity in item["entities"] for mention in entity["mentions"]]
        else:
            # NOTE(review): the fallback reads `item["negative_triggers"]`, as the original code did; an item
            # that only has "candidates" may not provide this key -- confirm the intended source.
            candidates = item["negative_triggers"]
        for candidate in candidates:
            # The trigger has no gold event, so the true type is always "NA". The span comes from the trigger
            # being processed, not from a loop variable left over from another branch.
            self.examples.append(self._make_example(trigger_idx, item["text"], pred_type, "NA",
                                                    trigger_position, candidate["position"], "NA"))

    def read_examples(self,
                      input_file: str) -> None:
        """Obtains a collection of `EAEInputExample`s for the dataset.

        Each non-empty line of `input_file` is parsed as one JSON object. For items with `"events"`, every
        annotated trigger yields positive argument examples (labeled with the role) and negative ones (labeled
        `"NA"`); negative triggers that are predicted as an event yield all-`"NA"` examples. Items without
        `"events"` are read from `"candidates"` using predicted types only. A running index keeps the triggers
        aligned with `self.event_preds`; it advances once per trigger / negative trigger / candidate.

        Args:
            input_file: Path of the JSON-lines file to read. If the path contains `"train"`, the gold event type
                is used as `pred_type` for the negative arguments of annotated triggers.

        Raises:
            AssertionError: If predictions are present and their count differs from the number of triggers read.
        """
        self.examples = []
        # Index of the current trigger in `self.event_preds`.
        trigger_idx = 0
        # Gold trigger types replace predictions when training, when configured, or when no predictions exist.
        use_gold_type = self.is_training or self.config.golden_trigger or self.event_preds is None
        force_gold_type = "train" in input_file or self.config.golden_trigger

        # Read as UTF-8 explicitly so that parsing does not depend on the platform's default encoding.
        with open(input_file, "r", encoding="utf-8") as f:
            all_lines = f.readlines()

        for line in tqdm(all_lines, desc="Reading from %s" % input_file):
            line = line.strip()
            if not line:
                # A blank line carries no record; `json.loads("")` would raise.
                continue
            item = json.loads(line)
            if "events" in item:
                for event in item["events"]:
                    for trigger in event["triggers"]:
                        true_type = event["type"]
                        pred_type = true_type if use_gold_type else self.event_preds[trigger_idx]
                        # Advance before any `continue` so the index stays aligned with the predictions.
                        trigger_idx += 1

                        if self.config.eae_eval_mode in ['default', 'loose'] and pred_type == "NA":
                            continue
                        self._read_trigger_arguments(item, trigger, pred_type, true_type, force_gold_type)

                # negative triggers
                for trigger in item["negative_triggers"]:
                    if self.config.eae_eval_mode in ['default', 'strict']:
                        pred_type = "NA" if use_gold_type else self.event_preds[trigger_idx]
                        if pred_type != "NA":
                            self._read_unlabeled_trigger(item, trigger_idx, trigger["position"], pred_type)
                    trigger_idx += 1
            else:
                for candi in item["candidates"]:
                    pred_type = self.event_preds[trigger_idx]  # we can only use pred type here, gold not available
                    if pred_type != "NA":
                        self._read_unlabeled_trigger(item, trigger_idx, candi["position"], pred_type)
                    trigger_idx += 1

        if self.event_preds is not None:
            assert trigger_idx == len(self.event_preds), \
                "Read %d triggers but found %d event predictions." % (trigger_idx, len(self.event_preds))

    def insert_marker(self,
                      text: str,
                      type: str,
                      trigger_position: List[int],
                      argument_position: List[int],
                      markers: Dict[str, str],
                      whitespace: Optional[bool] = True) -> str:
        """Adds a marker at the start and end position of event triggers and argument mentions.

        Args:
            text: The original text.
            type: Key into `markers` that selects the trigger markers.
            trigger_position: `[start, end)` character span of the trigger.
            argument_position: `[start, end)` character span of the argument mention.
            markers: Mapping whose values are indexed with `[0]` (opening marker) and `[1]` (closing marker);
                the key `"argument"` selects the argument markers.
            whitespace: Whether to put a space between a marker and the span it wraps.

        Returns:
            The text with the markers inserted. When both spans start at the same character the trigger marker
            opens first; when both end at the same character the trigger marker closes first.
        """
        space = " " if whitespace else ""
        # Collect the pieces and join once instead of growing a string character by character.
        pieces = []
        for i, char in enumerate(text):
            if i == trigger_position[0]:
                pieces.append(markers[type][0])
                pieces.append(space)
            if i == argument_position[0]:
                pieces.append(markers["argument"][0])
                pieces.append(space)
            pieces.append(char)
            if i == trigger_position[1] - 1:
                pieces.append(space)
                pieces.append(markers[type][1])
            if i == argument_position[1] - 1:
                pieces.append(space)
                pieces.append(markers["argument"][1])
        return "".join(pieces)

    def convert_examples_to_features(self) -> None:
        """Converts the `EAEInputExample`s into `EAEInputFeatures`s.

        Markers are inserted around the trigger and the argument, the marked text is tokenized to
        `config.max_seq_length`, and the token indices of the markers become the trigger / argument positions.
        If the argument markers are missing after tokenization, the argument positions fall back to 0 and the
        label is set to -100. If the trigger markers are missing, the trigger positions fall back to 0 and only
        a warning is logged. The result is stored in `self.input_features`.
        """
        self.input_features = []
        whitespace = self.config.language == "English"
        # The argument marker ids are the same for every example, so look them up once.
        argument_left_id = self.tokenizer.convert_tokens_to_ids(self.config.markers["argument"][0])
        argument_right_id = self.tokenizer.convert_tokens_to_ids(self.config.markers["argument"][1])

        # merge and then tokenize
        for example in tqdm(self.examples, desc="Processing features for TC"):
            text = self.insert_marker(example.text,
                                      example.pred_type,
                                      [example.trigger_left, example.trigger_right],
                                      [example.argument_left, example.argument_right],
                                      self.config.markers,
                                      whitespace)
            outputs = self.tokenizer(text,
                                     padding="max_length",
                                     truncation=True,
                                     max_length=self.config.max_seq_length)
            input_ids = outputs["input_ids"]
            is_overflow = False

            # argument position
            try:
                argument_left = input_ids.index(argument_left_id)
                argument_right = input_ids.index(argument_right_id)
            except ValueError:
                # `list.index` raises ValueError when a marker id is absent from the token ids.
                argument_left, argument_right = 0, 0
                logger.warning("Argument markers are not in the input tokens.")
                is_overflow = True

            # trigger position (a missing trigger marker does not count as overflow)
            trigger_markers = self.config.markers[example.pred_type]
            try:
                trigger_left = input_ids.index(self.tokenizer.convert_tokens_to_ids(trigger_markers[0]))
                trigger_right = input_ids.index(self.tokenizer.convert_tokens_to_ids(trigger_markers[1]))
            except ValueError:
                trigger_left, trigger_right = 0, 0
                logger.warning("Trigger markers are not in the input tokens.")

            # Roberta tokenizer doesn't return token_type_ids
            if "token_type_ids" not in outputs:
                outputs["token_type_ids"] = [0] * len(input_ids)

            features = EAEInputFeatures(
                example_id=example.example_id,
                input_ids=input_ids,
                attention_mask=outputs["attention_mask"],
                token_type_ids=outputs["token_type_ids"],
                trigger_left=trigger_left,
                trigger_right=trigger_right,
                argument_left=argument_left,
                argument_right=argument_right
            )
            if example.labels is not None:
                features.labels = self.config.role2id[example.labels]
                if is_overflow:
                    # The argument is not visible in the input, so the label is replaced by -100.
                    features.labels = -100
            self.input_features.append(features)