Add wikitext2 ppl benchmark with fixed context length in eval_llm #2547

Closed
eval_llm/WIKITEXT2/run_wikitext-2_benchmark.py: 54 changes (8 additions, 46 deletions)
@@ -2,16 +2,14 @@
 import json
 import numpy as np
 import os
-import pyonmttok
+import sentencepiece as spm
 import time
-from onmt.constants import CorpusTask, DefaultTokens
+from onmt.constants import DefaultTokens
 from onmt.inference_engine import InferenceEnginePY
-from onmt.inputters.dynamic_iterator import build_dynamic_dataset_iter
 import onmt.opts as opts
 from onmt.utils.logging import init_logger
 from onmt.utils.parse import ArgumentParser
 from onmt.utils.misc import use_gpu, set_random_seed
-from onmt.transforms import get_transforms_cls


 def compute_file_ppl(output_filename):
@@ -30,63 +28,27 @@ def compute_file_ppl(output_filename):
 def tokenize_dataset(opt, context_length):
     print("Tokenization...")

-    # Prepare the dataset
+    # Concat the dataset
     x = open(opt.src, "r").readlines()
     x = [_x.rstrip("\n") for _x in x]
     y = DefaultTokens.SEP.join(x)

     with open(opt.src + ".temp", "w") as writer:
         writer.write(y)

-    # ########################## #
-    # Build the dataset iterator #
-    # ########################## #
-
-    # Build the vocab
-    vocab_path_in = "/nas-labs/LM/big_llms/llama/7B/llama.vocab"
-    voc = []
-    with open(vocab_path_in, "r", encoding="utf-8") as reader:
-        for line in reader:
-            line = line.strip("\n")
-            voc.append(line)
-    vocabs = {}
-    src_vocab = pyonmttok.build_vocab_from_tokens(voc)
-    vocabs["src"] = src_vocab
-    vocabs["tgt"] = src_vocab
-    vocabs["data_task"] = "lm"
-    vocabs["decoder_start_token"] = "<s>"
-
-    transforms_cls = get_transforms_cls(opt._all_transform)
-
-    new_opt = opt
-    new_opt.gpu = -1
-    new_opt.parallel_mode = "data_parallel"
-    new_opt.src = opt.src + ".temp"
-
-    dataset_iter = build_dynamic_dataset_iter(
-        new_opt, transforms_cls, vocabs, task=CorpusTask.INFER, device_id=-1
-    )
+    # Tokenize
+    SP = spm.SentencePieceProcessor(opt.src_subword_model)
+    x = open(opt.src + ".temp", "r").readlines()
+    tokens = SP.encode(x[0], out_type=str)

-    input_tokens = []
-    for batch, i in dataset_iter:
-        for i in range(batch["src"].size()[0]):
-            start_ids = batch["src"][i, :, 0].cpu().numpy().tolist()
-            input_tokens += [
-                vocabs["src"].lookup_index(id)
-                for id in start_ids
-                if id != vocabs["src"].lookup_token(DefaultTokens.PAD)
-            ]

     def make_chunks(lst, n):
         """Yield successive n-sized chunks from lst."""
         for i in range(0, len(lst), n):
             yield lst[i : i + n]

-    # #################### #
-    # Tokenize the dataset #
-    # ################### #
     with open(opt.src + f".tokenized.context_{context_length}", "w") as writer:
-        for _chunk in make_chunks(input_tokens, context_length - 1):
+        for _chunk in make_chunks(tokens, context_length - 1):
             writer.write(" ".join(_chunk) + "\n")
             print(len(_chunk))

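For context, the new tokenization path reduces to the short standalone sketch below: join the raw lines with the separator token, run SentencePiece over the concatenated text once, and emit fixed-size chunks of tokens, one chunk per line. The file paths and the context length of 512 are placeholder assumptions; the script itself takes them from `opt` (`-src`, `-src_subword_model`) and the command line.

```python
# Minimal sketch of the new tokenization path in tokenize_dataset.
# Paths and context_length below are placeholders, not the script's defaults.
import sentencepiece as spm
from onmt.constants import DefaultTokens

context_length = 512

# Concat the dataset: one long line, with SEP marking the original newlines
with open("wiki.test.raw", "r") as f:                 # placeholder for opt.src
    text = DefaultTokens.SEP.join(line.rstrip("\n") for line in f)

# Tokenize the whole corpus in a single pass
sp = spm.SentencePieceProcessor("tokenizer.model")    # placeholder for opt.src_subword_model
tokens = sp.encode(text, out_type=str)

# Write fixed-size chunks of (context_length - 1) tokens, one chunk per line
with open(f"wiki.test.raw.tokenized.context_{context_length}", "w") as writer:
    for i in range(0, len(tokens), context_length - 1):
        writer.write(" ".join(tokens[i : i + context_length - 1]) + "\n")
```

Chunks are `context_length - 1` tokens rather than `context_length`, presumably to leave room for the decoder start token that is prepended before scoring.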
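The body of `compute_file_ppl` is collapsed in the diff above. The number it reports is the usual corpus-level perplexity: the exponential of the negative log-likelihood per token, pooled over every fixed-length chunk. A self-contained illustration with made-up scores, not the function's actual parsing code:

```python
import numpy as np

# Hypothetical per-chunk results: (sum of token log-probs, number of tokens).
# compute_file_ppl presumably reads the equivalent from the inference output file.
scored = [(-832.7, 511), (-790.2, 511), (-415.9, 206)]

total_nll = -sum(logprob for logprob, _ in scored)    # negative log-likelihood
total_tokens = sum(n for _, n in scored)

ppl = np.exp(total_nll / total_tokens)                # perplexity = exp(NLL per token)
print(f"wikitext-2 ppl: {ppl:.2f}")
```

Pooling by total token count rather than averaging the per-chunk perplexities matters: chunks can differ in length (the last one is usually shorter), and perplexity is only comparable across runs when it is normalized per token.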