This project was completed quite some time ago, but the model had not been published until now. I'm glad to finally make it available on the Hugging Face Models Hub here.
The model was fine-tuned from XLM-RoBERTa (multilingual RoBERTa), a Transformer encoder, for the task of inserting Vietnamese accent marks.
Accent insertion was modelled as token classification: each token is assigned a label that encodes the transformation needed to restore its accent marks.
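Concretely, each tag name has the form "<raw>-<accented>", and applying a tag to a word means replacing the raw substring with its accented counterpart. A minimal sketch, assuming a hypothetical tag "i-ì" (the full set of 528 tags is loaded from selected_tags_names.txt in Step 3 below):

# a hypothetical tag in the "<raw>-<accented>" format used by the tag set
tag_name = "i-ì"
raw, accented = tag_name.split("-")
print("Nhin".replace(raw, accented))  # -> "Nhìn"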
Below are the major steps, with Python code. The code is purposely written to illustrate the process.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np
# Step 1: Load model
def load_trained_transformer_model():
    model_path = "peterhung/vietnamese-accent-marker-xlm-roberta"
    tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    return model, tokenizer
model, tokenizer = load_trained_transformer_model()
# Step 2: Run input text through the model
# moving to a device is only needed when running on GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# set to eval mode
model.eval()
def insert_accents(text, model, tokenizer):
    our_tokens = text.strip().split()

    # the tokenizer may further split our tokens
    inputs = tokenizer(our_tokens,
                       is_split_into_words=True,
                       truncation=True,
                       padding=True,
                       return_tensors="pt")
    input_ids = inputs['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # exclude the special tokens '<s>' and '</s>' at the two ends
    tokens = tokens[1:-1]

    with torch.no_grad():
        inputs = inputs.to(device)
        outputs = model(**inputs)
        predictions = outputs["logits"].cpu().numpy()
        predictions = np.argmax(predictions, axis=2)

    # exclude output at index 0 and the last index, which correspond to '<s>' and '</s>'
    predictions = predictions[0][1:-1]

    assert len(tokens) == len(predictions)
    return tokens, predictions
text = "Nhin nhung mua thu di, em nghe sau len trong nang."
tokens, predictions = insert_accents(text, model, tokenizer)
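At this point, tokens holds the tokenizer's subword pieces and predictions holds one predicted label index per piece. A quick optional check (the exact pieces and indexes depend on the tokenizer and the trained weights):

print(tokens)       # subword pieces; a leading "▁" marks the start of a word
print(predictions)  # one predicted label index per piece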
# Step 3: Obtain the accented words
def _load_tags_set(fpath):
    labels = []
    with open(fpath, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                labels.append(line)
    return labels
label_list = _load_tags_set("./selected_tags_names.txt")
assert len(label_list) == 528, f"Expected 528 tags, got {len(label_list)}"
TOKENIZER_WORD_PREFIX = "▁"
def merge_tokens_and_preds(tokens, predictions):
    merged_tokens_preds = []

    i = 0
    while i < len(tokens):
        tok = tokens[i]
        label_indexes = set([predictions[i]])
        if tok.startswith(TOKENIZER_WORD_PREFIX):  # start a new word
            tok_no_prefix = tok[len(TOKENIZER_WORD_PREFIX):]
            cur_word_toks = [tok_no_prefix]
            # check if subsequent toks are part of this word
            j = i + 1
            while j < len(tokens):
                if not tokens[j].startswith(TOKENIZER_WORD_PREFIX):
                    cur_word_toks.append(tokens[j])
                    label_indexes.add(predictions[j])
                    j += 1
                else:
                    break
            cur_word = ''.join(cur_word_toks)
            merged_tokens_preds.append((cur_word, label_indexes))
            i = j
        else:
            merged_tokens_preds.append((tok, label_indexes))
            i += 1

    return merged_tokens_preds
merged_tokens_preds = merge_tokens_and_preds(tokens, predictions)
print(merged_tokens_preds)
# Output: [('Nhin', {217}), ('nhung', {388}), ('mua', {407}), ('thu', {378}), ('di,', {120, 0}), ('em', {185}), ('nghe', {185}), ('sau', {41}), ('len', {188}), ('trong', {302}), ('nang.', {0, 14})]
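Note that a merged word can carry more than one candidate label, e.g. ('di,', {120, 0}) above: when the tokenizer splits a word into several pieces, each piece receives its own prediction, and all of them are collected into the set. Step 3b below simply uses the first label that actually changes the word.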
# Step 3b: Replace the tag name with actual accent marks
def get_accented_words(merged_tokens_preds, label_list):
    accented_words = []
    for word_raw, label_indexes in merged_tokens_preds:
        # use the first label that changes word_raw
        for label_index in label_indexes:
            tag_name = label_list[int(label_index)]
            raw, vowel = tag_name.split("-")
            if raw and raw in word_raw:
                word_accented = word_raw.replace(raw, vowel)
                break
        else:
            # no tag changed the word; keep it as-is
            word_accented = word_raw

        accented_words.append(word_accented)

    return accented_words
accented_words = get_accented_words(merged_tokens_preds, label_list)
print(accented_words)
# Output: ['Nhìn', 'những', 'mùa', 'thu', 'đi,', 'em', 'nghe', 'sâu', 'lên', 'trong', 'nắng.']
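Finally, to recover the accented sentence, the words can simply be joined back with spaces:

print(" ".join(accented_words))
# -> "Nhìn những mùa thu đi, em nghe sâu lên trong nắng."

For more details, refer to this Model page.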