Dump Results

import jsonlines
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from typing import List, Dict, Union, Tuple
from .convert_format import get_pred_per_mention
from .metric import select_start_position
from ..input_engineering.input_utils import check_pred_len, get_left_and_right_pos

get_sentence_arguments

Gets the predicted arguments from a sentence in the Sequence Labeling paradigm.

Args:

  • input_sentence: A list of dictionaries each of which contains the word and the corresponding bio-role.

Returns:

  • arguments: A list of dictionaries each of which contains the word and the corresponding role.

def get_sentence_arguments(input_sentence: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Get the predicted arguments from a sentence in the Sequence Labeling paradigm.
    Args:
        input_sentence (`List[Dict[str, str]]`):
            A list of dictionaries each of which contains the word and the corresponding bio-role.
    Returns:
        arguments (`List[Dict[str, str]]`):
            A list of dictionaries each of which contains the word and the corresponding role.
    """
    input_sentence.append({"role": "NA", "word": "<EOS>"})
    arguments = []

    previous_role = None
    previous_arg = ""
    for item in input_sentence:
        if item["role"] != "NA" and previous_role is None:
            previous_role = item["role"]
            previous_arg = item["word"]

        elif item["role"] == previous_role:
            previous_arg += item["word"]

        elif item["role"] != "NA":
            arguments.append({"role": previous_role, "argument": previous_arg})
            previous_role = item["role"]
            previous_arg = item["word"]

        elif previous_role is not None:
            arguments.append({"role": previous_role, "argument": previous_arg})
            previous_role = None
            previous_arg = ""

    return arguments

get_maven_submission

Converts the predictions to the submission format of the MAVEN dataset and dumps the predictions into a json file.

Args:

  • preds: A list of strings indicating the predicted types of the instances.

  • instance_ids: A list of strings containing the id of each instance to be predicted.

  • result_file: A string indicating the path to place the written json file.

def get_maven_submission(preds: Union[np.array, List[str]],
                         instance_ids: List[str],
                         result_file: str) -> None:
    """Converts the predictions to the submission format of the MAVEN dataset.
    Converts the predictions to the submission format of the MAVEN dataset and dumps the predictions into a json file.
    Args:
        preds (`List[str]`):
            A list of strings indicating the predicted types of the instances.
        instance_ids (`List[str]`):
            A list of strings containing the id of each instance to be predicted.
        result_file (`str`):
            A string indicating the path to place the written json file.
    """
    all_results = defaultdict(list)
    for i, pred in enumerate(preds):
        example_id, candidate_id = instance_ids[i].split("-")
        all_results[example_id].append({
            "id": candidate_id,
            "type_id": int(pred)
        })
    with open(result_file, "w") as f:
        for data_id in all_results.keys():
            format_result = dict(id=data_id, predictions=[])
            for candidate in all_results[data_id]:
                format_result["predictions"].append(candidate)
            f.write(json.dumps(format_result) + "\n")

get_maven_submission_sl

Obtains the instances’ predictions in the test file of the MAVEN dataset based on the sequence labeling paradigm and converts the predictions to the dataset’s submission format. The converted predictions are dumped into a json file for submission.

Args:

  • preds: A list of strings indicating the predicted types of the instances.

  • labels: A list of strings indicating the actual labels of the instances.

  • result_file: A string indicating the path to place the written json file.

  • type2id: A dictionary containing the correspondences between event types and ids.

  • config: The configurations of the model.

def get_maven_submission_sl(preds: Union[np.array, List[str]],
                            labels: Union[np.array, List[str]],
                            is_overflow,
                            result_file: str,
                            type2id: Dict[str, int],
                            config) -> None:
    """Converts the predictions to the submission format of the MAVEN dataset based on the sequence labeling paradigm.
    Obtains the instances' predictions in the test file of the MAVEN dataset based on the sequence labeling paradigm and
    converts the predictions to the dataset's submission format. The converted predictions are dumped into a json file
    for submission.
    Args:
        preds (`List[str]`):
            A list of strings indicating the predicted types of the instances.
        labels (`List[str]`):
            A list of strings indicating the actual labels of the instances.
        is_overflow:
        result_file (`str`):
            A string indicating the path to place the written json file.
        type2id (`Dict[str, int]`):
            A dictionary containing the correspondences between event types and ids.
        config:
            The configurations of the model.
    """
    # get per-word predictions
    preds, _ = select_start_position(preds, labels, False)
    results = defaultdict(list)
    language = config.language

    with open(config.test_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            item = json.loads(line.strip())
            text = item["text"]

            # check for alignment
            if not is_overflow[i]:
                check_pred_len(pred=preds[i], item=item, language=language)

            for candidate in item["candidates"]:
                # get word positions
                word_pos_start, word_pos_end = get_left_and_right_pos(text=text, trigger=candidate, language=language)
                # get predictions
                pred = get_pred_per_mention(word_pos_start, word_pos_end, preds[i], config.id2type)
                # record results
                results[item["id"]].append({
                    "id": candidate["id"].split("-")[-1],
                    "type_id": int(type2id[pred]),
                })
    # dump results
    with open(result_file, "w") as f:
        for id, preds_per_doc in results.items():
            results_per_doc = dict(id=id, predictions=preds_per_doc)
            f.write(json.dumps(results_per_doc)+"\n")

get_maven_submission_seq2seq

Obtains the instances’ predictions in the test file of the MAVEN dataset based on the Sequence-to-Sequence (Seq2Seq) paradigm and converts the predictions to the dataset’s submission format. The converted predictions are dumped into a json file for submission.

Args:

  • preds: The textual predictions of the Event Type or Argument Role. A list of tuple lists, in which each tuple is (argument, role) or (trigger, event_type)

  • save_path: A string indicating the path to place the written json file.

  • data_args: The pre-defined arguments for data processing.

def get_maven_submission_seq2seq(preds: List[List[Tuple[str, str]]],
                                 save_path: str,
                                 data_args) -> None:
    """Converts the predictions to the submission format of the MAVEN dataset based on the Seq2Seq paradigm.
    Obtains the instances' predictions in the test file of the MAVEN dataset based on the Sequence-to-Sequence (Seq2Seq)
    paradigm and converts the predictions to the dataset's submission format. The converted predictions are dumped into
    a json file for submission.
    Args:
        preds (`List[List[Tuple[str, str]]]`):
            The textual predictions of the Event Type or Argument Role.
            A list of tuple lists, in which each tuple is (argument, role) or (trigger, event_type)
        save_path (`str`):
            A string indicating the path to place the written json file.
        data_args:
            The pre-defined arguments for data processing.
    """
    type2id = data_args.type2id
    results = defaultdict(list)
    with open(data_args.test_file, "r") as f:
        lines = f.readlines()
        for idx, line in enumerate(lines):
            item = json.loads(line.strip())
            text = item["text"]
            preds_per_idx = preds[idx]

            for candidate in item["candidates"]:
                label = "NA"
                left_pos, right_pos = candidate["position"]
                # get predictions
                pred_type = get_pred_per_mention(pos_start=left_pos, pos_end=right_pos, preds=preds_per_idx, text=text,
                                                 label=label, label2id=type2id, paradigm='s2s')

                # record results
                results[item["id"]].append({"id": candidate["id"].split("-")[-1], "type_id": int(type2id[pred_type])})
    # dump results
    with open(save_path, "w") as f:
        for id, preds_per_doc in results.items():
            results_per_doc = dict(id=id, predictions=preds_per_doc)
            f.write(json.dumps(results_per_doc) + "\n")

get_leven_submission

Converts the predictions to the submission format of the LEVEN dataset and dumps the predictions into a json file.

Args:

  • preds: A list of strings indicating the predicted types of the instances.

  • instance_ids: A list of strings containing the id of each instance to be predicted.

  • result_file: A string indicating the path to place the written json file.

Returns:

  • The parameters of the input are passed to the get_maven_submission() method for further predictions.

def get_leven_submission(preds: Union[np.array, List[str]],
                         instance_ids: List[str],
                         result_file: str) -> None:
    """Converts the predictions to the submission format of the LEVEN dataset.
    Converts the predictions to the submission format of the LEVEN dataset and dumps the predictions into a json file.
    Args:
        preds (`List[str]`):
            A list of strings indicating the predicted types of the instances.
        instance_ids (`List[str]`):
            A list of strings containing the id of each instance to be predicted.
        result_file (`str`):
            A string indicating the path to place the written json file.
    Returns:
        The parameters of the input are passed to the `get_maven_submission()` method for further predictions.
    """
    return get_maven_submission(preds, instance_ids, result_file)

get_leven_submission_sl

Obtains the instances’ predictions in the test file of the LEVEN dataset based on the sequence labeling paradigm and converts the predictions to the dataset’s submission format. The converted predictions are dumped into a json file for submission.

Args:

  • preds: A list of strings indicating the predicted type of the instances.

  • labels: A list of strings indicating the actual label of the instances.

  • result_file: A string indicating the path to place the written json file.

  • type2id: A dictionary containing the correspondences between event types and ids.

  • config: The configurations of the model.

Returns:

  • The parameters of the input are passed to the get_maven_submission_sl() method for further predictions.

def get_leven_submission_sl(preds: Union[np.array, List[str]],
                            labels: Union[np.array, List[str]],
                            is_overflow,
                            result_file: str,
                            type2id: Dict[str, int],
                            config):
    """Converts the predictions to the submission format of the LEVEN dataset based on the sequence labeling paradigm.
    Obtains the instances' predictions in the test file of the LEVEN dataset based on the sequence labeling paradigm and
    converts the predictions to the dataset's submission format. The converted predictions are dumped into a json file
    for submission.
    Args:
        preds (`List[str]`):
            A list of strings indicating the predicted type of the instances.
        labels (`List[str]`):
            A list of strings indicating the actual label of the instances.
        is_overflow:
        result_file (`str`):
            A string indicating the path to place the written json file.
        type2id (`Dict[str, int]`):
            A dictionary containing the correspondences between event types and ids.
        config:
            The configurations of the model.
    Returns:
        The parameters of the input are passed to the `get_maven_submission_sl()` method for further predictions.
    """
    return get_maven_submission_sl(preds, labels, is_overflow, result_file, type2id, config)

get_leven_submission_seq2seq

Obtains the instances’ predictions in the test file of the LEVEN dataset based on the Sequence-to-Sequence (Seq2Seq) paradigm and converts the predictions to the dataset’s submission format. The converted predictions are dumped into a json file for submission.

Args:

  • preds: The textual predictions of the Event Type or Argument Role. A list of tuple lists, in which each tuple is (argument, role) or (trigger, event_type)

  • save_path: A string indicating the path to place the written json file.

  • data_args: The pre-defined arguments for data processing.

Returns:

  • The parameters of the input are passed to the get_maven_submission_seq2seq() method for further predictions. The formats of LEVEN and MAVEN are the same.

def get_leven_submission_seq2seq(preds: List[List[Tuple[str, str]]],
                                 save_path: str,
                                 data_args) -> None:
    """Converts the predictions to the submission format of the LEVEN dataset based on the Seq2Seq paradigm.
    Obtains the instances' predictions in the test file of the LEVEN dataset based on the Sequence-to-Sequence (Seq2Seq)
    paradigm and converts the predictions to the dataset's submission format. The converted predictions are dumped into
    a json file for submission.
    Args:
        preds (`List[List[Tuple[str, str]]]`):
            The textual predictions of the Event Type or Argument Role.
            A list of tuple lists, in which each tuple is (argument, role) or (trigger, event_type)
        save_path (`str`):
            A string indicating the path to place the written json file.
        data_args:
            The pre-defined arguments for data processing.
    Returns:
        The parameters of the input are passed to the `get_maven_submission_seq2seq()` method for further predictions.
        The formats of LEVEN and MAVEN are the same.
    """
    return get_maven_submission_seq2seq(preds, save_path, data_args)

get_duee_submission_sl

Args:

  • preds: A list of strings indicating the predicted types of the instances.

  • labels: A list of strings indicating the actual labels of the instances.

  • result_file: A string indicating the path to place the written json file.

  • config: The configurations of the model.

Returns:

all_results: A list of dictionaries containing the predictions of events.

def get_duee_submission_sl(preds: Union[np.array, List[str]],
                           labels: Union[np.array, List[str]],
                           is_overflow,
                           result_file: str,
                           config) -> List[Dict[str, Union[str, Dict]]]:
    """Converts the predictions to the submission format of the DuEE dataset based on the sequence labeling paradigm.
    Obtains the instances' predictions in the test file of the DuEE dataset based on the sequence labeling paradigm and
    converts the predictions to the dataset's submission format. The converted predictions are dumped into a json file
    for submission.
    Args:
        preds (`List[str]`):
            A list of strings indicating the predicted types of the instances.
        labels (`List[str]`):
            A list of strings indicating the actual labels of the instances.
        is_overflow:
        result_file (`str`):
            A string indicating the path to place the written json file.
        config:
            The configurations of the model.
    Returns:
        all_results (`List[Dict[str, Union[str, Dict]]]`):
            A list of dictionaries containing the predictions of events.
    """
    # trigger predictions
    ed_preds = json.load(open(config.test_pred_file))

    # get per-word predictions
    preds, labels = select_start_position(preds, labels, False)
    all_results = []

    with open(config.test_file, "r", encoding='utf-8') as f:
        trigger_idx = 0
        example_idx = 0
        lines = f.readlines()
        for line in tqdm(lines, desc='Generating DuEE1.0 Submission Files'):
            item = json.loads(line.strip())

            item_id = item["id"]
            event_list = []

            for tid, trigger in enumerate(item["candidates"]):
                pred_event_type = ed_preds[trigger_idx]
                if pred_event_type != "NA":
                    if not is_overflow[example_idx]:
                        if config.language == "English":
                            assert len(preds[example_idx]) == len(item["text"].split())
                        elif config.language == "Chinese":
                            assert len(preds[example_idx]) == len("".join(item["text"].split()))  # remove space token
                        else:
                            raise NotImplementedError

                    pred_event = dict(event_type=pred_event_type, arguments=[])
                    sentence_result = []
                    for cid, candidate in enumerate(item["candidates"]):
                        if cid == tid:
                            continue
                        char_pos = candidate["position"]
                        if config.language == "English":
                            word_pos_start = len(item["text"][:char_pos[0]].split())
                            word_pos_end = word_pos_start + len(item["text"][char_pos[0]:char_pos[1]].split())
                        elif config.language == "Chinese":
                            word_pos_start = len([w for w in item["text"][:char_pos[0]] if w.strip('\n\xa0� ')])
                            word_pos_end = len([w for w in item["text"][:char_pos[1]] if w.strip('\n\xa0� ')])
                        else:
                            raise NotImplementedError
                        # get predictions
                        pred = get_pred_per_mention(word_pos_start, word_pos_end, preds[example_idx], config.id2role)
                        sentence_result.append({"role": pred, "word": candidate["trigger_word"]})

                    pred_event["arguments"] = get_sentence_arguments(sentence_result)
                    if pred_event["arguments"]:
                        event_list.append(pred_event)

                    example_idx += 1

                trigger_idx += 1

            all_results.append({"id": item_id, "event_list": event_list})

    # dump results
    with jsonlines.open(result_file, "w") as f:
        for r in all_results:
            jsonlines.Writer.write(f, r)

    return all_results