/++
Print each token id and its string piece for the given text.

Usage: `tokenize -m model.gguf [-s] [text]`
  - `-s`  include BOS/EOS special tokens (off by default)
+/
module tokenize_example;

import llama;
import std.stdio : writefln, writeln, stderr;
import core.stdc.locale : setlocale, LC_NUMERIC;
import core.stdc.stdio : printf;

/++
Program entry point.

Parses the command line, loads the model in vocab-only mode, prints a few
vocabulary facts, tokenizes the text, prints one row per token, and finally
detokenizes the tokens again to compare the result with the input text.

Params:
    args = command-line arguments; `args[0]` is the program name.

Returns: 0 on success, 1 on a usage error, a model load failure, or a
tokenization failure.
+/
int main(string[] args)
{
    // Force the "C" numeric locale before any other work is done.
    setlocale(LC_NUMERIC, "C");

    string modelPath;
    string text = "Hello, world!";
    bool addSpecial;

    for (int i = 1; i < cast(int) args.length; i++)
    {
        switch (args[i])
        {
        case "-m":
            // "-m" consumes the following argument as the model path.
            if (++i < cast(int) args.length) modelPath = args[i];
            else return printUsage(args[0]);
            break;
        case "-s":
            addSpecial = true;
            break;
        default:
            // Any other argument replaces the text; the last one wins.
            text = args[i];
            break;
        }
    }
    if (modelPath.length == 0) return printUsage(args[0]);

    loadAllBackends();

    // Vocab-only: skips loading weight tensors; all we need for tokenization.
    auto model = LlamaModel.loadVocabOnly(modelPath);
    if (!model)
    {
        stderr.writefln("error: unable to load model '%s'", modelPath);
        return 1;
    }

    const vocab = model.vocab;

    writefln("vocab size : %d", model.nVocab);
    writefln("BOS token : %d", bosToken(vocab));
    writefln("EOS token : %d", eosToken(vocab));
    writefln("EOT token : %d", eotToken(vocab));
    writeln();

    auto tokens = tokenize(vocab, text, addSpecial);
    // NOTE(review): a null result is treated as failure; if `tokenize` can
    // return an empty (null) array for empty text, that case is reported as
    // an error too -- confirm against the binding.
    if (tokens is null)
    {
        stderr.writeln("error: tokenization failed");
        return 1;
    }

    writefln("text : \"%s\"", text);
    writefln("tokens : %d", tokens.length);
    writeln();

    writefln("%-8s %-6s %s", "id", "type", "piece");
    writefln("%-8s %-6s %s", "--------", "------", "-----");
    foreach (t; tokens)
    {
        // The end-of-generation check takes precedence over the BOS check.
        string tag = isEog(vocab, t) ? "EOG" : (t == bosToken(vocab) ? "BOS" : "tok");
        writefln("%-8d %-6s \"%s\"", t, tag, tokenToString(vocab, t));
    }
    writeln();

    // Round-trip check.
    string roundTrip = detokenize(vocab, tokens);
    writefln("round-trip : \"%s\"", roundTrip);
    writeln(roundTrip == text
        ? "round-trip : OK"
        : "round-trip : differs (expected if special tokens were added)");

    return 0;
}

/++
Print the usage text to stdout.

Params:
    prog = program name shown in the usage line.

Returns: always 1, so callers can write `return printUsage(...)`.
+/
int printUsage(string prog) @trusted nothrow
{
    // A D string is a (pointer, length) slice with no guaranteed NUL
    // terminator, so print it with an explicit length via "%.*s" rather
    // than handing the bare pointer to "%s".
    printf("\nusage: %.*s -m model.gguf [-s] [text]\n\n"
        ~ " -m path to a GGUF model file\n"
        ~ " -s add BOS/EOS special tokens around the text\n\n",
        cast(int) prog.length, prog.ptr);
    return 1;
}