/++
Multimodal inference CLI — feed an image (and optional text) to a vision model.

Usage:
---
multimodal -m model.gguf --mmproj mmproj.gguf \
           [-i image.jpg] [-n n_predict] [-ngl n_gpu_layers] [--no-gpu] [prompt]
---

The language model and projector must be compatible (same architecture).
If no image is supplied the tool behaves like a plain text-completion CLI.
+/
module multimodal;

import llama;
import llama.mtmd;
import std.stdio : write, writeln, writefln, stderr;
import std.conv : to;
import std.string : fromStringz;
import core.stdc.locale : setlocale, LC_NUMERIC;
import core.stdc.stdio : printf;

// ────────────────────────────────────────────────────────────────────────────

/++
Parses a command-line value as an `int` without letting a conversion
exception escape.

Params:
    text  = raw argument text
    value = receives the parsed number (reset to 0 when parsing fails)

Returns: `true` when `text` is a well-formed `int`, `false` otherwise.
+/
private bool parseIntArg(string text, out int value)
{
    import std.conv : ConvException;

    try
    {
        value = text.to!int;
        return true;
    }
    catch (ConvException)
    {
        // Also covers ConvOverflowException, which derives from ConvException.
        return false;
    }
}

/++
Program entry point: parses the command line, loads the language model and
the multimodal projector, tokenises the prompt (plus optional image),
evaluates it and then samples up to `-n` tokens, streaming them to stdout.

Params:
    args = process arguments; `args[0]` is the program name

Returns: 0 on success, 1 on a usage error, a load/tokenise/eval failure, or a
         decode failure during generation.
+/
int main(string[] args) @trusted
{
    import std.stdio : stdout;
    import std.string : toStringz;

    setlocale(LC_NUMERIC, "C");

    string modelPath;
    string mmprojPath;
    string imagePath;
    string prompt = "Describe the image in detail.";
    int ngl = 99;
    int nPredict = 512;
    int nBatch = 512;
    bool useGpu = true;

    for (size_t i = 1; i < args.length; i++)
    {
        switch (args[i])
        {
        case "-m":
            if (++i < args.length) modelPath = args[i];
            else return printUsage(args[0]);
            break;
        case "--mmproj":
            if (++i < args.length) mmprojPath = args[i];
            else return printUsage(args[0]);
            break;
        case "-i":
            if (++i < args.length) imagePath = args[i];
            else return printUsage(args[0]);
            break;
        case "-n":
            if (++i >= args.length)
                return printUsage(args[0]);
            // A negative count would wrap around in the unsigned context-size
            // computation further down, so it is rejected here.
            if (!parseIntArg(args[i], nPredict) || nPredict < 0)
            {
                stderr.writeln("error: -n expects a non-negative integer, got '",
                               args[i], "'");
                return printUsage(args[0]);
            }
            break;
        case "-ngl":
            if (++i >= args.length)
                return printUsage(args[0]);
            if (!parseIntArg(args[i], ngl))
            {
                stderr.writeln("error: -ngl expects an integer, got '",
                               args[i], "'");
                return printUsage(args[0]);
            }
            break;
        case "--no-gpu":
            useGpu = false;
            break;
        default:
            // Any other argument is taken as the prompt; the last one wins.
            prompt = args[i];
            break;
        }
    }

    if (modelPath.length == 0 || mmprojPath.length == 0)
        return printUsage(args[0]);

    // ── Backend + model ──────────────────────────────────────────────────────
    loadAllBackends();

    auto model = LlamaModel.loadFromFile(modelPath, ngl);
    if (!model)
    {
        stderr.writeln("error: failed to load language model '", modelPath, "'");
        return 1;
    }

    // ── Multimodal projector ─────────────────────────────────────────────────
    auto mparams = mtmd_context_params_default();
    mparams.use_gpu = useGpu;
    mparams.print_timings = true;

    auto mtmd = MtmdContext.initFromFile(mmprojPath, model.ptr, mparams);
    if (!mtmd)
    {
        stderr.writeln("error: failed to load mmproj '", mmprojPath, "'");
        return 1;
    }

    // ── Build prompt with optional media marker ──────────────────────────────
    bool haveImage = imagePath.length > 0;
    string marker = fromStringz(mtmd_default_marker()).idup;
    string fullPrompt = haveImage ? marker ~ "\n" ~ prompt : prompt;

    // ── Tokenise ─────────────────────────────────────────────────────────────
    auto chunks = InputChunks.create();
    // Only a pointer (no length) is handed over, so the text has to be
    // NUL-terminated. A D slice gives no such guarantee, and indexing [0] of
    // an empty prompt would raise a RangeError; toStringz covers both cases.
    auto inputTxt = mtmd_input_text(fullPrompt.toStringz, /*add_special=*/true,
                                    /*parse_special=*/true);

    if (haveImage)
    {
        // MtmdBitmap is non-copyable; it stays alive until after tokenize().
        auto bitmapStore = mtmd.loadBitmap(imagePath);
        if (!bitmapStore)
        {
            stderr.writeln("error: failed to load image '", imagePath, "'");
            return 1;
        }
        const(mtmd_bitmap)*[1] bitmapPtrs = [bitmapStore.ptr];
        if (auto err = mtmd.tokenize(chunks, inputTxt, bitmapPtrs[]))
        {
            stderr.writefln("error: mtmd tokenization failed (code %d)", err);
            return 1;
        }
    }
    else
    {
        if (auto err = mtmd.tokenize(chunks, inputTxt))
        {
            stderr.writefln("error: mtmd tokenization failed (code %d)", err);
            return 1;
        }
    }

    // ── Context ──────────────────────────────────────────────────────────────
    // Room for the whole prompt plus every token we may generate.
    uint nCtxNeeded = cast(uint)(chunks.nTokens + nPredict);
    auto ctx = LlamaContext.fromModel(model,
                                      nCtxNeeded,
                                      cast(uint) nBatch);
    if (!ctx)
    {
        stderr.writeln("error: failed to create llama_context");
        return 1;
    }

    // ── Eval all chunks (text + image embeddings) ────────────────────────────
    llama_pos nPast;
    if (auto err = mtmd.evalChunks(ctx.ptr, chunks, 0, /*seq_id=*/0,
                                   nBatch, /*logits_last=*/true, nPast))
    {
        stderr.writefln("error: chunk eval failed (code %d)", err);
        return 1;
    }

    // ── Generation ───────────────────────────────────────────────────────────
    const vocab = model.vocab;

    auto smpl = SamplerChain.create();
    smpl.temp(0.8f).topK(40).topP(0.95f).dist();

    llama_token[1] nextBuf;
    int exitCode = 0;

    for (int gen = 0; gen < nPredict; gen++)
    {
        auto tok = smpl.sample(ctx);
        if (isEog(vocab, tok)) break;

        // Stream each piece immediately so output appears token by token.
        write(tokenToString(vocab, tok));
        stdout.flush();

        smpl.accept(tok);
        nextBuf[0] = tok;
        auto batch = batchGetOne(nextBuf[]);
        if (ctx.decode(batch))
        {
            stderr.writeln("error: decode failed");
            // Report the failure through the exit status, like every other
            // error path, while still printing the perf summary below.
            exitCode = 1;
            break;
        }
        nPast++;
    }

    writeln();

    ctx.printPerf();
    smpl.printPerf();
    writeln();

    return exitCode;
}

// ────────────────────────────────────────────────────────────────────────────

/++
Prints the command-line help text to stdout through C `printf`.

Params:
    prog = program name shown in the usage line (normally `args[0]`)

Returns: always 1, so callers can write `return printUsage(...)` to exit with
         a failing status.
+/
int printUsage(string prog) @trusted nothrow
{
    // A D slice is not guaranteed to be NUL-terminated, so the name is printed
    // with an explicit length ("%.*s") instead of relying on "%s" to find a
    // terminator.
    printf(
        "\nusage: %.*s -m model.gguf --mmproj mmproj.gguf\n" ~
        " [-i image.jpg] [-n n_predict] [-ngl n_gpu_layers]\n" ~
        " [--no-gpu] [prompt]\n\n" ~
        " -m path to the GGUF language model\n" ~
        " --mmproj path to the multimodal projector GGUF\n" ~
        " -i input image or audio file (optional)\n" ~
        " -n number of tokens to predict (default: 512)\n" ~
        " -ngl number of GPU layers (default: 99)\n" ~
        " --no-gpu run projector on CPU only\n" ~
        " prompt text prompt (default: 'Describe the image in detail.')\n\n",
        cast(int) prog.length, prog.ptr);
    return 1;
}