1 /++
2 Multimodal inference CLI — feed an image (and optional text) to a vision model.
3 
4 Usage:
5 ---
multimodal -m model.gguf --mmproj mmproj.gguf \
           [-i image.jpg] [-n n_predict] [-ngl n_gpu_layers] [--no-gpu] [prompt]
8 ---
9 
10 The language model and projector must be compatible (same architecture).
11 If no image is supplied the tool behaves like a plain text-completion CLI.
12 +/
13 module multimodal;
14 
15 import llama;
16 import llama.mtmd;
17 import std.stdio  : write, writeln, writefln, stderr;
18 import std.conv   : to;
19 import std.string : fromStringz;
20 import core.stdc.locale : setlocale, LC_NUMERIC;
21 import core.stdc.stdio  : printf;
22 
23 // ────────────────────────────────────────────────────────────────────────────
24 
/++
Entry point of the multimodal CLI.

Parses the command line, loads the language model and the multimodal
projector, tokenises the prompt (prefixed with the media marker when an image
path was given), evaluates the resulting chunks and then samples up to
`nPredict` tokens, streaming each one to stdout.

Params:
    args = process arguments; `args[0]` is used as the program name in the
           usage text.

Returns:
    0 on success; 1 on a usage error, an invalid option value, a load /
    tokenise / eval failure, or a decode failure during generation.
+/
int main(string[] args) @trusted
{
    import std.conv   : ConvException;
    import std.stdio  : stdout;
    import std.string : toStringz;

    setlocale(LC_NUMERIC, "C");

    string modelPath;
    string mmprojPath;
    string imagePath;
    string prompt   = "Describe the image in detail.";
    int    ngl      = 99;
    int    nPredict = 512;
    int    nBatch   = 512;
    bool   useGpu   = true;

    // Advances `i` to the value that follows the option at args[i].
    // Returns false when the option is the last argument (value missing).
    bool nextValue(ref size_t i, ref string dest)
    {
        if (++i >= args.length)
            return false;
        dest = args[i];
        return true;
    }

    // Same as nextValue, but parses the value as an int. A malformed number
    // is reported on stderr and treated like a missing value instead of
    // letting ConvException escape from main.
    bool nextInt(ref size_t i, ref int dest)
    {
        immutable optName = args[i];
        string raw;
        if (!nextValue(i, raw))
            return false;
        try
        {
            dest = raw.to!int;
        }
        catch (ConvException)
        {
            stderr.writeln("error: invalid integer '", raw,
                           "' for option ", optName);
            return false;
        }
        return true;
    }

    for (size_t i = 1; i < args.length; i++)
    {
        switch (args[i])
        {
            case "-m":
                if (!nextValue(i, modelPath)) return printUsage(args[0]);
                break;
            case "--mmproj":
                if (!nextValue(i, mmprojPath)) return printUsage(args[0]);
                break;
            case "-i":
                if (!nextValue(i, imagePath)) return printUsage(args[0]);
                break;
            case "-n":
                if (!nextInt(i, nPredict)) return printUsage(args[0]);
                break;
            case "-ngl":
                if (!nextInt(i, ngl)) return printUsage(args[0]);
                break;
            case "--no-gpu":
                useGpu = false;
                break;
            default:
                // Any unrecognised argument is taken as the prompt; the last
                // one on the command line wins.
                prompt = args[i];
                break;
        }
    }

    if (modelPath.length == 0 || mmprojPath.length == 0)
        return printUsage(args[0]);

    // A negative count would wrap around in the unsigned context-size
    // computation further down, so reject it up front.
    if (nPredict < 0)
    {
        stderr.writeln("error: -n must be non-negative");
        return printUsage(args[0]);
    }

    // ── Backend + model ──────────────────────────────────────────────────────
    loadAllBackends();

    auto model = LlamaModel.loadFromFile(modelPath, ngl);
    if (!model)
    {
        stderr.writeln("error: failed to load language model '", modelPath, "'");
        return 1;
    }

    // ── Multimodal projector ─────────────────────────────────────────────────
    auto mparams          = mtmd_context_params_default();
    mparams.use_gpu       = useGpu;
    mparams.print_timings = true;

    auto mtmd = MtmdContext.initFromFile(mmprojPath, model.ptr, mparams);
    if (!mtmd)
    {
        stderr.writeln("error: failed to load mmproj '", mmprojPath, "'");
        return 1;
    }

    // ── Build prompt with optional media marker ──────────────────────────────
    bool   haveImage  = imagePath.length > 0;
    string marker     = fromStringz(mtmd_default_marker()).idup;
    string fullPrompt = haveImage ? marker ~ "\n" ~ prompt : prompt;

    // ── Tokenise ─────────────────────────────────────────────────────────────
    auto chunks   = InputChunks.create();
    // D slices carry no NUL terminator (a `~` concatenation never does), so
    // hand the C side a terminated copy. This also copes with an empty prompt,
    // where indexing element 0 would be out of range.
    auto promptZ  = toStringz(fullPrompt);
    auto inputTxt = mtmd_input_text(promptZ, /*add_special=*/true,
                                             /*parse_special=*/true);

    if (haveImage)
    {
        // MtmdBitmap is non-copyable; it stays alive until after tokenize().
        auto bitmapStore = mtmd.loadBitmap(imagePath);
        if (!bitmapStore)
        {
            stderr.writeln("error: failed to load image '", imagePath, "'");
            return 1;
        }
        const(mtmd_bitmap)*[1] bitmapPtrs = [bitmapStore.ptr];
        if (auto err = mtmd.tokenize(chunks, inputTxt, bitmapPtrs[]))
        {
            stderr.writefln("error: mtmd tokenization failed (code %d)", err);
            return 1;
        }
    }
    else
    {
        if (auto err = mtmd.tokenize(chunks, inputTxt))
        {
            stderr.writefln("error: mtmd tokenization failed (code %d)", err);
            return 1;
        }
    }

    // ── Context ──────────────────────────────────────────────────────────────
    // Room for every prompt token plus everything we may generate.
    uint nCtxNeeded = cast(uint)(chunks.nTokens + nPredict);
    auto ctx = LlamaContext.fromModel(model,
                   nCtxNeeded,
                   cast(uint) nBatch);
    if (!ctx)
    {
        stderr.writeln("error: failed to create llama_context");
        return 1;
    }

    // ── Eval all chunks (text + image embeddings) ────────────────────────────
    llama_pos nPast; // receives the position reported by evalChunks
    if (auto err = mtmd.evalChunks(ctx.ptr, chunks, 0, /*seq_id=*/0,
                                   nBatch, /*logits_last=*/true, nPast))
    {
        stderr.writefln("error: chunk eval failed (code %d)", err);
        return 1;
    }

    // ── Generation ───────────────────────────────────────────────────────────
    const vocab = model.vocab;

    auto smpl = SamplerChain.create();
    smpl.temp(0.8f).topK(40).topP(0.95f).dist();

    llama_token[1] nextBuf;
    int exitCode = 0;

    for (int gen = 0; gen < nPredict; gen++)
    {
        auto tok = smpl.sample(ctx);
        if (isEog(vocab, tok)) break;

        // Stream each token as soon as it is produced.
        write(tokenToString(vocab, tok));
        stdout.flush();

        smpl.accept(tok);
        nextBuf[0] = tok;
        auto batch = batchGetOne(nextBuf[]);
        if (ctx.decode(batch))
        {
            stderr.writeln("error: decode failed");
            // Report the failure through the exit status instead of exiting 0.
            exitCode = 1;
            break;
        }
    }

    writeln();

    ctx.printPerf();
    smpl.printPerf();
    writeln();

    return exitCode;
}
188 
189 // ────────────────────────────────────────────────────────────────────────────
190 
/++
Prints the command-line usage summary to stdout (via C `printf`).

Params:
    prog = program name substituted into the usage line. It does not need to
           be NUL-terminated: it is printed with a length-bounded `%.*s`.

Returns:
    Always 1, so callers can write `return printUsage(args[0]);` to exit with
    a failure status.
+/
int printUsage(string prog) @trusted nothrow
{
    // A D string is a (pointer, length) slice with no guaranteed terminator,
    // so pass the length explicitly rather than relying on `%s`.
    printf(
        "\nusage: %.*s -m model.gguf --mmproj mmproj.gguf\n" ~
        "          [-i image.jpg] [-n n_predict] [-ngl n_gpu_layers]\n" ~
        "          [--no-gpu] [prompt]\n\n" ~
        "  -m       path to the GGUF language model\n" ~
        "  --mmproj path to the multimodal projector GGUF\n" ~
        "  -i       input image or audio file (optional)\n" ~
        "  -n       number of tokens to predict (default: 512)\n" ~
        "  -ngl     number of GPU layers (default: 99)\n" ~
        "  --no-gpu run projector on CPU only\n" ~
        "  prompt   text prompt (default: 'Describe the image in detail.')\n\n",
        cast(int) prog.length, prog.ptr);
    return 1;
}