Tokenizer
import collections
import logging
import numpy as np
import os
from transformers import PreTrainedTokenizer
from typing import Dict, Iterable, List, Optional, Tuple, Union
logger = logging.getLogger(__name__)
load_vocab
Loads a vocabulary file, allocates a unique id for each word within the vocabulary, and saves the correspondence between words and ids into a dictionary. Generates and returns word embeddings if required.
Args:
- vocab_file: The path of the vocabulary file.
- return_embeddings: Whether or not to return the word embeddings.
Returns:
- word_embeddings: A numpy array holding each word's embedding within the vocabulary, with shape (number of words) * (embedding dimension). Returned only if return_embeddings is set to True.
- vocab: A dictionary mapping each word within the vocabulary to its unique id. Returned when return_embeddings is False.
def load_vocab(vocab_file: str,
return_embeddings: bool = False) -> Union[Dict[str, int], np.ndarray]:
"""Loads a vocabulary file into a dictionary.
Loads a vocabulary file, allocates a unique id for each word within the vocabulary, and saves the correspondence
between words and ids into a dictionary. Generates and returns word embeddings if required.
Args:
vocab_file (`str`):
The path of the vocabulary file.
return_embeddings (`bool`, `optional`, defaults to `False`):
Whether or not to return the word embeddings.
Returns:
word_embeddings (`np.ndarray`):
A numpy array holding each word's embedding within the vocabulary, with shape (number of words) *
(embedding dimension). Returned only if `return_embeddings` is set to `True`.
vocab (`Dict[str, int]`):
A dictionary mapping each word within the vocabulary to its unique id.
"""
vocab = collections.OrderedDict()
vocab["[PAD]"] = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
lines = reader.readlines()
num_embeddings = len(lines) + 1
embedding_dim = len(lines[0].split()) - 1
for index, line in enumerate(lines):
token = " ".join(line.split()[:-embedding_dim])
if token in vocab:
token = f"{token}_{index+1}"
vocab[token] = index + 1
if return_embeddings:
word_embeddings = np.zeros((num_embeddings, embedding_dim), dtype=np.float32)
for index, line in enumerate(lines):
embedding = [float(value) for value in line.strip().split()[-embedding_dim:]]
word_embeddings[index+1] = embedding
return word_embeddings
return vocab
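A minimal usage sketch follows. The file name vec.txt is a placeholder; the function only assumes that each line of the file holds a token followed by its embedding values, separated by spaces.
# Hypothetical example: "vec.txt" contains lines of the form "<token> <v_1> ... <v_d>".
vocab = load_vocab("vec.txt")
print(vocab["[PAD]"])  # 0, the reserved padding id
embeddings = load_vocab("vec.txt", return_embeddings=True)
print(embeddings.shape)  # (number of words + 1, embedding dimension); row 0 stays all zeros for [PAD]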
whitespace_tokenize()
Cleans the whitespace at the beginning and end of the text and splits the text into a list based on whitespace.
Args:
- text: A string representing the input text to be processed.
Returns:
- tokens: A list of strings in which each element represents a word within the input text.
def whitespace_tokenize(text: str) -> List[str]:
"""Runs basic whitespace cleaning and splitting on a piece of text.
Cleans the whitespace at the beginning and end of the text and splits the text into a list based on whitespace.
Args:
text (`str`):
A string representing the input text to be processed.
Returns:
tokens (`List[str]`):
A list of strings in which each element represents a word within the input text.
"""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
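For illustration, a couple of expected input/output pairs:
assert whitespace_tokenize("  hello   world \n") == ["hello", "world"]
assert whitespace_tokenize("   ") == []  # whitespace-only input yields an empty list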
WordLevelTokenizer
This tokenizer inherits from PreTrainedTokenizer, which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.
Attributes:
- vocab: A dictionary indicating the correspondence between words and ids within the vocabulary.
- ids_to_tokens: A dictionary indicating the correspondence between ids and words within the vocabulary.
- whitespace_tokenizer: A WhitespaceTokenizer instance for word-level tokenization.
VOCAB_FILES_NAMES = {"vocab_file": "vec.txt"}
PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
PRETRAINED_INIT_CONFIGURATION = {}
class WordLevelTokenizer(PreTrainedTokenizer):
"""Construct a BERT tokenizer. Based on WordPiece.
This tokenizer inherits from `PreTrainedTokenizer` which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Attributes:
vocab (`Dict[str, int]`):
A dictionary indicating the correspondence between words and ids within the vocabulary.
ids_to_tokens (`Dict[int, str]`):
A dictionary indicating the correspondence between ids and words within the vocabulary.
whitespace_tokenizer (`WhitespaceTokenizer`):
A `WhitespaceTokenizer` instance for word-level tokenization.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self,
vocab_file: str,
do_lower_case: bool = True,
never_split: Optional[Iterable] = None,
unk_token: str = "[UNK]",
sep_token: str = "[SEP]",
pad_token: str = "[PAD]",
cls_token: str = "[CLS]",
strip_accents: Optional[bool] = None,
model_max_length: int = 512,
**kwargs):
"""Construct a WordLevelTokenizer."""
kwargs["model_max_length"] = model_max_length
super().__init__(
do_lower_case=do_lower_case,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
strip_accents=strip_accents,
**kwargs,
)
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
# insert special token
for token in [unk_token, sep_token, pad_token, cls_token]:
if token not in self.vocab:
self.vocab[token] = len(self.vocab)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.whitespace_tokenizer = WhitespaceTokenizer(vocab=self.vocab, do_lower_case=do_lower_case,
unk_token=self.unk_token)
@property
def do_lower_case(self):
"""Returns whether or not to lowercase the input when tokenizing."""
return self.whitespace_tokenizer.do_lower_case
@property
def vocab_size(self):
"""Returns the length of the vocabulary"""
return len(self.vocab)
def get_vocab(self):
"""Returns the vocabulary in a dictionary."""
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self,
text: str):
"""Tokenizes the input text into tokens."""
if self.do_lower_case:
text = text.lower()
split_tokens = self.whitespace_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self,
token: str):
"""Converts a token (`str`) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self,
index: int):
"""Converts an index (`int`) in a token (`str`) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self,
tokens: List[str]):
"""Converts a sequence of tokens (`str`) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""Builds model inputs from a sequence or a pair of sequence.
Builds model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of ids to which the special tokens will be added.
token_ids_1 (`List[int]`, `optional`):
Optional second list of ids for sequence pairs.
Returns:
`List[int]`: List of [input ids](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
"""Retrieve sequence ids from a token list that has no special tokens added."""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""Create a mask from the two sequences passed to be used in a sequence-pair classification task."""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self,
save_directory: str,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""Saves the vocabulary (copy original file) and special tokens file to a directory."""
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
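A usage sketch for the tokenizer as a whole. The vocabulary file name and the example sentences are placeholders; the snippet only assumes a local embedding file in the format expected by load_vocab.
# Hypothetical usage: "vec.txt" is a local vocabulary/embedding file.
tokenizer = WordLevelTokenizer(vocab_file="vec.txt")
tokens = tokenizer.tokenize("hello world")  # e.g. ["hello", "world"], with "[UNK]" for out-of-vocabulary words
single = tokenizer("hello world")  # input_ids follow the single-sequence format [CLS] X [SEP]
pair = tokenizer("hello world", "how are you")  # pair format [CLS] A [SEP] B [SEP]; token_type_ids are 0s then 1s
print(single["input_ids"], pair["token_type_ids"])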
WhitespaceTokenizer
Tokenizes a piece of text into whitespace-separated tokens, replacing any token that is not in the vocabulary with the unknown token.
Attributes:
- vocab: A dictionary indicating the correspondence between words and ids within the vocabulary.
- do_lower_case: A boolean variable indicating whether or not to lowercase the input when tokenizing.
- unk_token: A string representing the unknown token.
class WhitespaceTokenizer(object):
"""A tokenizer to conduct word piece tokenization.
Tokenizes a piece of text into its word pieces by matching whether the token is in the vocabulary.
Attributes:
vocab (`Dict[str, int]`):
A dictionary indicates the correspondence between words and ids within the vocabulary.
do_lower_case (`bool`):
A boolean variable indicating Whether or not to lowercase the input when tokenizing.
unk_token (`str`):
A string representing the unknown token.
"""
def __init__(self,
vocab: Dict[str, int],
do_lower_case: bool,
unk_token: str):
"""Constructs a `WhitespaceTokenizer`."""
self.vocab = vocab
self.do_lower_case = do_lower_case
self.unk_token = unk_token
def tokenize(self,
text: str) -> List[str]:
"""Tokenizes a piece of text into its word pieces."""
output_tokens = []
for token in whitespace_tokenize(text):
if token in self.vocab:
output_tokens.append(token)
else:
output_tokens.append(self.unk_token)
return output_tokens
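A small sketch of the behaviour, assuming a toy vocabulary:
# Toy vocabulary for illustration only.
toy_vocab = {"[PAD]": 0, "hello": 1, "world": 2, "[UNK]": 3}
wt = WhitespaceTokenizer(vocab=toy_vocab, do_lower_case=True, unk_token="[UNK]")
print(wt.tokenize("hello brave world"))  # ["hello", "[UNK]", "world"]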