A Transformer model to auto insert Vietnamese accent marks

Finetuning XLM-Roberta to auto insert Vietnamese accent marks (diacritics)
vietnamese accent marks
finetuned xlm-roberta
Author

Hung Hoang

Published

September 3, 2024

This project was completed quite some time ago but the model wasn’t published yet. And now I’m glad to finally be able to make the model available on HuggingFace Models Hub here.

This model was finetuned based on XLM-Roberta (multilingual Roberta), a Transformer encoder, for the task of inserting Vietnamese accent marks.

This accent marks insertion was modelled as a token classification where the assigned label corresponds to the necessary transformation to insert accents.

Below are the major steps with Python code. The codes are purposefully written to illustrate the process.

from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np

# Step 1: Load model 

def load_trained_transformer_model():
    model_path = "peterhung/vietnamese-accent-marker-xlm-roberta"
    tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    return model, tokenizer

model, tokenizer = load_trained_transformer_model()

# Step 2: Run input text through the model 
# only needed if it's run on GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# set to eval mode
model.eval()

def insert_accents(text, model, tokenizer):
    our_tokens = text.strip().split()

    # the tokenizer may further split our tokens
    inputs = tokenizer(our_tokens,
                        is_split_into_words=True,
                        truncation=True,
                        padding=True,
                        return_tensors="pt"
                        )
    input_ids = inputs['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    tokens = tokens[1:-1]

    with torch.no_grad():
        inputs.to(device)
        outputs = model(**inputs)

    predictions = outputs["logits"].cpu().numpy()
    predictions = np.argmax(predictions, axis=2)

    # exclude output at index 0 and the last index, which correspond to '<s>' and '</s>'
    predictions = predictions[0][1:-1]

    assert len(tokens) == len(predictions)

    return tokens, predictions 


text = "Nhin nhung mua thu di, em nghe sau len trong nang."
tokens, predictions = insert_accents(text, model, tokenizer)

# Step3: Obtain the accented words 
def _load_tags_set(fpath):
    labels = []
    with open(fpath, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                labels.append(line)

    return labels
    
label_list = _load_tags_set("./selected_tags_names.txt")
assert len(label_list) == 528, f"Expect {len(label_list)} tags"


TOKENIZER_WORD_PREFIX = "▁"
def merge_tokens_and_preds(tokens, predictions): 
    merged_tokens_preds = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        label_indexes = set([predictions[i]])
        if tok.startswith(TOKENIZER_WORD_PREFIX): # start a new word
            tok_no_prefix = tok[len(TOKENIZER_WORD_PREFIX):]
            cur_word_toks = [tok_no_prefix]
            # check if subsequent toks are part of this word
            j = i + 1
            while j < len(tokens):
                if not tokens[j].startswith(TOKENIZER_WORD_PREFIX):
                    cur_word_toks.append(tokens[j])
                    label_indexes.add(predictions[j])
                    j += 1
                else:
                    break
            cur_word = ''.join(cur_word_toks)
            merged_tokens_preds.append((cur_word, label_indexes))
            i = j
        else:
            merged_tokens_preds.append((tok, label_indexes))
            i += 1

    return merged_tokens_preds


merged_tokens_preds = merge_tokens_and_preds(tokens, predictions)
print(merged_tokens_preds)
# Output: [('Nhin', {217}), ('nhung', {388}), ('mua', {407}), ('thu', {378}), ('di,', {120, 0}), ('em', {185}), ('nghe', {185}), ('sau', {41}), ('len', {188}), ('trong', {302}), ('nang.', {0, 14})]

# Step 3b: Replace the tag name with actual accent marks 
def get_accented_words(merged_tokens_preds, label_list):
    accented_words = []
    for word_raw, label_indexes in merged_tokens_preds:
        # use the first label that changes word_raw
        for label_index in label_indexes:
            tag_name = label_list[int(label_index)]
            raw, vowel = tag_name.split("-")
            if raw and raw in word_raw:
                word_accented = word_raw.replace(raw, vowel)
                break
        else:
            word_accented = word_raw

        accented_words.append(word_accented)

    return accented_words


accented_words = get_accented_words(merged_tokens_preds, label_list)
print(accented_words)
# Output: ['Nhìn', 'những', 'mùa', 'thu', 'đi,', 'em', 'nghe', 'sâu', 'lên', 'trong', 'nắng.']

For more details and information, refer to this Model page.