1 /++
2 Print each token id and its string piece for the given text.
3 
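Usage: `tokenize -m model.gguf [-s] [text]`
- `-m` path to the GGUF model file (required)
- `-s` include BOS/EOS special tokens (off by default)
- `text` string to tokenize (defaults to "Hello, world!")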
6 +/
7 module tokenize_example;
8 
9 import llama;
10 import std.stdio  : writefln, writeln, stderr;
11 import core.stdc.locale : setlocale, LC_NUMERIC;
12 import core.stdc.stdio  : printf;
13 
/++
Entry point: parse the command line, load the model in vocab-only mode,
then print vocabulary info, a per-token table for the text, and the result
of a detokenize round-trip.

Returns: 0 on success; 1 on a usage error, a model load failure, or a
tokenization failure.
+/
int main(string[] args)
{
    setlocale(LC_NUMERIC, "C");

    string ggufPath;
    string input = "Hello, world!";
    bool   withSpecial;

    // Walk every argument after the program name.
    for (size_t idx = 1; idx < args.length; ++idx)
    {
        string arg = args[idx];

        if (arg == "-m")
        {
            // `-m` consumes the next argument as the model path.
            ++idx;
            if (idx >= args.length)
                return printUsage(args[0]);
            ggufPath = args[idx];
        }
        else if (arg == "-s")
        {
            withSpecial = true;
        }
        else
        {
            // Any other argument is taken as the text; the last one wins.
            input = arg;
        }
    }

    // A model path is mandatory.
    if (ggufPath.length == 0)
        return printUsage(args[0]);

    loadAllBackends();

    // Vocab-only load: weight tensors are skipped, tokenization needs only the vocab.
    auto model = LlamaModel.loadVocabOnly(ggufPath);
    if (!model)
    {
        stderr.writefln("error: unable to load model '%s'", ggufPath);
        return 1;
    }

    const voc = model.vocab;

    // Vocabulary summary.
    writefln("vocab size : %d", model.nVocab);
    writefln("BOS token  : %d", bosToken(voc));
    writefln("EOS token  : %d", eosToken(voc));
    writefln("EOT token  : %d", eotToken(voc));
    writeln();

    auto ids = tokenize(voc, input, withSpecial);
    if (ids is null)
    {
        stderr.writeln("error: tokenization failed");
        return 1;
    }

    writefln("text       : \"%s\"", input);
    writefln("tokens     : %d", ids.length);
    writeln();

    // Token table: id, classification tag, and string piece.
    writefln("%-8s  %-6s  %s", "id", "type", "piece");
    writefln("%-8s  %-6s  %s", "--------", "------", "-----");
    foreach (id; ids)
    {
        // The end-of-generation check takes precedence over the BOS check.
        string tag;
        if (isEog(voc, id))
            tag = "EOG";
        else if (id == bosToken(voc))
            tag = "BOS";
        else
            tag = "tok";

        writefln("%-8d  %-6s  \"%s\"", id, tag, tokenToString(voc, id));
    }
    writeln();

    // Detokenize and compare with the original text.
    string rebuilt = detokenize(voc, ids);
    writefln("round-trip : \"%s\"", rebuilt);
    if (rebuilt == input)
        writeln("round-trip : OK");
    else
        writeln("round-trip : differs (expected if special tokens were added)");

    return 0;
}
87 
/++
Print the command-line usage summary to stdout.

Params:
    prog = program name shown in the usage line (callers pass `args[0]`)

Returns: always 1, so callers can write `return printUsage(...)` to exit
with a failure status.
+/
int printUsage(string prog) @trusted nothrow
{
    // A D slice is not guaranteed to be NUL-terminated, so a bare `%s` on
    // `prog.ptr` may read past the end. `%.*s` takes an explicit length
    // (as int) and stops there.
    printf("\nusage: %.*s -m model.gguf [-s] [text]\n\n"
           ~ "  -m  path to a GGUF model file\n"
           ~ "  -s  add BOS/EOS special tokens around the text\n\n",
           cast(int) prog.length, prog.ptr);
    return 1;
}