/++
Minimal text-completion example.
Usage: `simple -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]`
+/
module simple;

import llama;
import std.stdio : write, writeln, writefln, stderr;
import std.conv : to, ConvException;
import core.stdc.locale : setlocale, LC_NUMERIC;
import core.stdc.stdio : printf;

/++
Program entry point: parses the command line, loads the model, prints the
prompt, then greedily samples up to `nPredict` tokens, streaming each one to
stdout. Timing and perf counters are reported on stderr.

Params:
    args = process arguments; `args[0]` is the program name.

Returns: 0 on success, 1 on a usage error or any load/tokenize/encode/decode failure.
+/
int main(string[] args)
{
    import core.time : MonoTime;
    import std.stdio : stdout;

    setlocale(LC_NUMERIC, "C");

    string modelPath;
    string prompt = "Hello my name is";
    int ngl = 99;
    int nPredict = 32;

    for (size_t i = 1; i < args.length; i++)
    {
        switch (args[i])
        {
        case "-m":
            if (++i >= args.length) return printUsage(args[0]);
            modelPath = args[i];
            break;
        case "-n":
            // A negative count would wrap the unsigned context-size
            // computation further down, so it is rejected here.
            if (++i >= args.length || !parseIntArg(args[i], nPredict) || nPredict < 0)
                return printUsage(args[0]);
            break;
        case "-ngl":
            if (++i >= args.length || !parseIntArg(args[i], ngl))
                return printUsage(args[0]);
            break;
        default:
            // Any non-flag argument replaces the prompt (the last one wins).
            prompt = args[i];
            break;
        }
    }
    if (modelPath.length == 0) return printUsage(args[0]);

    loadAllBackends();

    auto model = LlamaModel.loadFromFile(modelPath, ngl);
    if (!model)
    {
        stderr.writeln("error: unable to load model '", modelPath, "'");
        return 1;
    }

    const vocab = model.vocab;

    auto tokens = tokenize(vocab, prompt);
    if (tokens.length == 0)
    {
        stderr.writeln("error: failed to tokenize prompt");
        return 1;
    }

    // Print prompt as decoded text.
    foreach (t; tokens) write(tokenToString(vocab, t));

    auto ctx = LlamaContext.fromModel(model,
        cast(uint)(tokens.length + nPredict - 1),
        cast(uint) tokens.length);
    if (!ctx)
    {
        stderr.writeln("error: failed to create llama_context");
        return 1;
    }

    // Two-statement form: avoids copy-constructing the non-copyable SamplerChain.
    auto smpl = SamplerChain.create();
    smpl.greedy();

    auto batch = batchGetOne(tokens);

    // Reusable single-token buffer, declared at function scope so that any
    // batch built from it stays valid for every later decode call.
    // NOTE(review): assumes batchGetOne references the slice instead of
    // copying it — confirm against the llama wrapper.
    // Stack array: avoids pointer-slicing under -preview=safer.
    llama_token[1] tokBuf;

    if (model.hasEncoder)
    {
        if (ctx.encode(batch)) { stderr.writeln("error: encode failed"); return 1; }
        tokBuf[0] = model.decoderStartToken;
        batch = batchGetOne(tokBuf[]);
    }

    auto tStart = MonoTime.currTime;
    int nDecode;

    for (int nPos; nPos + batch.n_tokens < cast(int) tokens.length + nPredict; )
    {
        if (ctx.decode(batch)) { stderr.writeln("error: decode failed"); return 1; }
        nPos += batch.n_tokens;

        auto newTok = smpl.sample(ctx);
        if (isEog(vocab, newTok)) break;

        write(tokenToString(vocab, newTok));
        stdout.flush(); // stream each token as soon as it is produced

        // The previous batch has already been decoded, so the buffer can be
        // overwritten with the freshly sampled token.
        tokBuf[0] = newTok;
        batch = batchGetOne(tokBuf[]);
        nDecode++;
    }
    writeln();

    // Integer arithmetic: avoids writefln+double template bug under -preview=all.
    long elapsedMs = (MonoTime.currTime - tStart).total!"msecs";
    // Widen before multiplying so the product is computed in 64 bits.
    long tokPerSec = (nDecode > 0 && elapsedMs > 0)
        ? cast(long) nDecode * 1000 / elapsedMs
        : 0;

    stderr.writefln("decoded %d tokens in %d.%02ds, speed: %d t/s",
        nDecode, elapsedMs / 1000, (elapsedMs % 1000) / 10, tokPerSec);
    stderr.writeln();
    smpl.printPerf();
    ctx.printPerf();
    stderr.writeln();

    return 0;
}

/++
Parses a command-line value as an `int` without letting a conversion error
escape.

Params:
    text  = the raw argument text.
    value = receives the parsed number on success.

Returns: `true` if `text` converted cleanly; `false` if `std.conv.to` threw a
`ConvException` (malformed or out-of-range input).
+/
private bool parseIntArg(string text, out int value)
{
    try
    {
        value = text.to!int;
        return true;
    }
    catch (ConvException)
    {
        return false;
    }
}

/++
Prints the usage line via C `printf`.

Params:
    prog = program name to show in the usage line.

Returns: always 1, so callers can write `return printUsage(...)`.
+/
int printUsage(string prog) @trusted nothrow
{
    // D strings are length-delimited and not guaranteed to be NUL-terminated,
    // so pass an explicit length with %.*s rather than relying on %s.
    printf("\nusage: %.*s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n\n",
        cast(int) prog.length, prog.ptr);
    return 1;
}