1 /++
2 D bindings and wrappers for libmtmd (multimodal support).
3 
4 libmtmd encodes images and audio into token embeddings that a language
5 model can attend to alongside ordinary text tokens.
6 
7 Typical usage:
8 ---
9 auto mtmd = MtmdContext.initFromFile("mmproj.gguf", model.ptr);
10 auto bmp  = mtmd.loadBitmap("photo.jpg");
11 auto chunks = InputChunks.create();
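// `text` must be a NUL-terminated C string; D slices are not guaranteed to be
// (needs `import std.string : toStringz;`).
auto txt  = mtmd_input_text(fullPrompt.toStringz, true, true);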
13 mtmd.tokenize(chunks, txt, [bmp.ptr]);
14 llama_pos nPast;
15 mtmd.evalChunks(ctx.ptr, chunks, 0, 0, 512, true, nPast);
16 // ... then sample as usual with SamplerChain
17 ---
18 +/
19 module llama.mtmd;
20 
21 import llama.llama; // llama_model, llama_context, llama_token, llama_pos, llama_seq_id
22 import llama.owned;
23 
24 // ── Callback aliases (matching ggml.h / ggml-backend.h) ────────────────────
/++
Logging callback signature accepted by `mtmd_log_set` and `mtmd_helper_log_set`.

Params:
    level     = `ggml_log_level` cast to int.
    text      = Message text as a C string.
    user_data = Opaque pointer; presumably the `user_data` given at registration.
+/
alias ggml_log_callback = extern(C) void function(int level, const(char)* text, void* user_data) nothrow;
/++
Eval callback signature stored in `mtmd_context_params.cb_eval`.
Return `false` to cancel.

Params:
    tensor    = Declared as `void*` in this binding (presumably a ggml tensor
                pointer on the C side — confirm against ggml-backend.h).
    ask       = Flag supplied by the caller; its meaning is defined by ggml.
    user_data = Opaque pointer; presumably `mtmd_context_params.cb_eval_user_data`.
+/
alias ggml_backend_sched_eval_callback = extern(C) bool function(void* tensor, bool ask, void* user_data) nothrow;
29 
30 // ── Opaque C types ──────────────────────────────────────────────────────────
/// Opaque multimodal context; created by `mtmd_init_from_file`, released by `mtmd_free`.
struct mtmd_context;
/// Opaque image/audio bitmap; created by the `mtmd_bitmap_init*` and
/// `mtmd_helper_bitmap_init_from_*` functions, released by `mtmd_bitmap_free`.
struct mtmd_bitmap;
/// Opaque image-token descriptor; obtained from `mtmd_input_chunk_get_tokens_image`.
struct mtmd_image_tokens;
/// Opaque single input chunk; obtained from `mtmd_input_chunks_get` or `mtmd_input_chunk_copy`.
struct mtmd_input_chunk;
/// Opaque chunk list; created by `mtmd_input_chunks_init`, released by `mtmd_input_chunks_free`.
struct mtmd_input_chunks;
36 
37 // ── Plain C structs ─────────────────────────────────────────────────────────
38 
/// Chunk type emitted by `mtmd_tokenize`.
/// The numeric values are spelled out explicitly because they cross the C boundary.
enum mtmd_input_chunk_type : int
{
    text  = 0, /// Text chunk (see `mtmd_input_chunk_get_tokens_text`).
    image = 1, /// Image chunk (see `mtmd_input_chunk_get_tokens_image`).
    audio = 2, /// Audio chunk.
}
46 
/// Text input descriptor passed (by pointer) to `mtmd_tokenize`.
struct mtmd_input_text
{
    const(char)* text;        /// Prompt as a C string; may contain `mtmd_default_marker()`.
    bool         add_special;  /// Prepend BOS token if true.
    bool         parse_special;/// Interpret special tokens in the prompt.
}
54 
/++
Parameters for `mtmd_init_from_file`.
Always initialise via `mtmd_context_params_default()` and then override fields.

The struct is passed to C by value, so field order and types must stay in sync
with the C header.
+/
struct mtmd_context_params
{
    bool  use_gpu;                           /// Run the projector on GPU if available.
    bool  print_timings;                     /// Print encoder timing stats on free.
    int   n_threads;                         /// Number of CPU threads (0 = auto).
    const(char)* image_marker;               /// Deprecated; use `media_marker`.
    const(char)* media_marker;               /// Marker replaced by image/audio tokens (default: `<__media__>`).
    int   flash_attn_type;                   /// `llama_flash_attn_type` cast to int.
    bool  warmup;                            /// Run a warm-up encode pass after init.
    int   image_min_tokens;                  /// Dynamic-resolution lower bound (0 = from metadata).
    int   image_max_tokens;                  /// Dynamic-resolution upper bound (0 = from metadata).
    ggml_backend_sched_eval_callback cb_eval;/// Optional eval callback.
    void* cb_eval_user_data;                 /// User data for `cb_eval`.
}
73 
74 // ── C API declarations ──────────────────────────────────────────────────────
// Label attributes: every declaration from here on (until linkage is changed
// again) is declared with C linkage, `@nogc` and `nothrow`.
extern(C) @nogc nothrow:

/// Returns the default media marker string (`"<__media__>"`).
const(char)* mtmd_default_marker();

/// Returns a default-initialised `mtmd_context_params`.
mtmd_context_params mtmd_context_params_default();

/++
Initialises a multimodal context from a projector GGUF file.
Returns `null` on failure (bad path, incompatible model, etc.).

Params:
    mmproj_fname = Path of the projector GGUF file, as a C string.
    text_model   = The language model the projector is paired with.
    ctx_params   = Settings, passed by value; start from `mtmd_context_params_default()`.
+/
mtmd_context* mtmd_init_from_file(
    const(char)* mmproj_fname,
    const(llama_model)* text_model,
    mtmd_context_params ctx_params);

/// Frees a multimodal context.
void mtmd_free(mtmd_context* ctx);

/// True if the model requires a non-causal attention mask for llama_decode.
bool mtmd_decode_use_non_causal(mtmd_context* ctx);

/// True if the model uses M-RoPE (Multimodal RoPE) for llama_decode.
bool mtmd_decode_use_mrope(mtmd_context* ctx);

/// True if the model supports image input.
bool mtmd_support_vision(mtmd_context* ctx);

/// True if the model supports audio input.
bool mtmd_support_audio(mtmd_context* ctx);

/// Audio sample rate in Hz (e.g. 16 000 for Whisper), or -1 if unsupported.
int mtmd_get_audio_sample_rate(mtmd_context* ctx);
109 
110 // ── mtmd_bitmap ─────────────────────────────────────────────────────────────
111 
/// Create an image bitmap from raw RGB pixels (`RGBRGBRGB…`; length must equal `nx * ny * 3`).
/// Only a pointer is passed, so the C side cannot verify the buffer length.
mtmd_bitmap* mtmd_bitmap_init(uint nx, uint ny, const(ubyte)* data);

/// Create an audio bitmap from float PCM samples.
/// `n_samples` is the number of `float` elements readable at `data`.
mtmd_bitmap* mtmd_bitmap_init_from_audio(size_t n_samples, const(float)* data);

/// Returns the bitmap's `nx` value (presumably the `nx` given to `mtmd_bitmap_init`).
uint          mtmd_bitmap_get_nx    (const(mtmd_bitmap)* bitmap);
/// Returns the bitmap's `ny` value (presumably the `ny` given to `mtmd_bitmap_init`).
uint          mtmd_bitmap_get_ny    (const(mtmd_bitmap)* bitmap);
/// Returns a pointer to the bitmap's raw bytes; pair with `mtmd_bitmap_get_n_bytes`.
const(ubyte)* mtmd_bitmap_get_data  (const(mtmd_bitmap)* bitmap);
/// Returns the number of bytes readable through `mtmd_bitmap_get_data`.
size_t        mtmd_bitmap_get_n_bytes(const(mtmd_bitmap)* bitmap);
/// True if the bitmap holds audio rather than image data.
bool          mtmd_bitmap_is_audio  (const(mtmd_bitmap)* bitmap);
/// Frees a bitmap.
void          mtmd_bitmap_free      (mtmd_bitmap* bitmap);

/// Optional string ID used for KV-cache tracking.
const(char)*  mtmd_bitmap_get_id(const(mtmd_bitmap)* bitmap);
/// Set the bitmap ID (`id` is a C string).
void          mtmd_bitmap_set_id(mtmd_bitmap* bitmap, const(char)* id);
129 
130 // ── mtmd_input_chunks ───────────────────────────────────────────────────────
131 
/// Allocates an empty chunk list; release it with `mtmd_input_chunks_free`.
mtmd_input_chunks*       mtmd_input_chunks_init();
/// Number of chunks in the list.
size_t                   mtmd_input_chunks_size(const(mtmd_input_chunks)* chunks);
/// Returns the chunk at `idx`.
/// NOTE(review): behaviour for `idx >= size` is not visible from this binding — confirm.
const(mtmd_input_chunk)* mtmd_input_chunks_get (const(mtmd_input_chunks)* chunks, size_t idx);
/// Frees a chunk list.
void                     mtmd_input_chunks_free(mtmd_input_chunks* chunks);
136 
137 // ── mtmd_input_chunk ────────────────────────────────────────────────────────
138 
/// Kind of the chunk (text, image or audio).
mtmd_input_chunk_type     mtmd_input_chunk_get_type        (const(mtmd_input_chunk)* chunk);
/// Text tokens of the chunk; the token count is presumably written to `n_tokens_output`.
const(llama_token)*       mtmd_input_chunk_get_tokens_text (const(mtmd_input_chunk)* chunk, size_t* n_tokens_output);
/// Image-token descriptor of the chunk.
const(mtmd_image_tokens)* mtmd_input_chunk_get_tokens_image(const(mtmd_input_chunk)* chunk);
/// Number of tokens in the chunk.
size_t                    mtmd_input_chunk_get_n_tokens    (const(mtmd_input_chunk)* chunk);
/// ID string of the chunk.
const(char)*              mtmd_input_chunk_get_id          (const(mtmd_input_chunk)* chunk);
/// Number of positions the chunk occupies.
llama_pos                 mtmd_input_chunk_get_n_pos       (const(mtmd_input_chunk)* chunk);
/// Returns a mutable copy of the chunk (presumably to be released with `mtmd_input_chunk_free`).
mtmd_input_chunk*         mtmd_input_chunk_copy            (const(mtmd_input_chunk)* chunk);
/// Frees a chunk.
void                      mtmd_input_chunk_free            (mtmd_input_chunk* chunk);
147 
148 // ── mtmd_image_tokens ───────────────────────────────────────────────────────
149 
/// Number of tokens in the image-token descriptor.
size_t       mtmd_image_tokens_get_n_tokens(const(mtmd_image_tokens)* image_tokens);
/// The descriptor's `nx` value (NOTE(review): presumably a token-grid dimension — confirm).
size_t       mtmd_image_tokens_get_nx      (const(mtmd_image_tokens)* image_tokens);
/// The descriptor's `ny` value (NOTE(review): presumably a token-grid dimension — confirm).
size_t       mtmd_image_tokens_get_ny      (const(mtmd_image_tokens)* image_tokens);
/// ID string of the descriptor.
const(char)* mtmd_image_tokens_get_id      (const(mtmd_image_tokens)* image_tokens);
/// Number of positions the image tokens occupy.
llama_pos    mtmd_image_tokens_get_n_pos   (const(mtmd_image_tokens)* image_tokens);
155 
156 // ── Tokenise / encode ───────────────────────────────────────────────────────
157 
/++
Tokenise a text prompt that may contain media markers.
Returns 0 on success, 1 on bitmap-count mismatch, 2 on preprocessing error.

Params:
    ctx       = Multimodal context.
    output    = Chunk list (from `mtmd_input_chunks_init`) that receives the result.
    text      = Prompt descriptor; `text.text` is a C string.
    bitmaps   = Array of `n_bitmaps` bitmap pointers, one per media marker in the prompt.
    n_bitmaps = Number of entries in `bitmaps`.
+/
int mtmd_tokenize(
    mtmd_context*       ctx,
    mtmd_input_chunks*  output,
    const(mtmd_input_text)* text,
    const(mtmd_bitmap*)* bitmaps,
    size_t              n_bitmaps);

/// Encode a single image/audio chunk.  Returns 0 on success.
int mtmd_encode_chunk(mtmd_context* ctx, const(mtmd_input_chunk)* chunk);

/// Pointer to the float embeddings from the last `mtmd_encode_chunk` call.
/// The element count is not reported by this function.
float* mtmd_get_output_embd(mtmd_context* ctx);

/// Set a logging callback.
void mtmd_log_set(ggml_log_callback log_callback, void* user_data);
177 
178 // ── mtmd-helper API ─────────────────────────────────────────────────────────
179 
/// Set logging callback (also calls `mtmd_log_set` internally).
void mtmd_helper_log_set(ggml_log_callback log_callback, void* user_data);

/// Load an image or audio file into a bitmap.  Thread-safe.  Returns `null` on failure.
/// `fname` is a C string.
mtmd_bitmap* mtmd_helper_bitmap_init_from_file(mtmd_context* ctx, const(char)* fname);

/// Load from an in-memory buffer (JPEG/PNG/BMP/GIF/WAV/MP3/FLAC).  Thread-safe.
/// `len` is the number of bytes readable at `buf`.
mtmd_bitmap* mtmd_helper_bitmap_init_from_buf(mtmd_context* ctx, const(ubyte)* buf, size_t len);

/// Total token count across all chunks (for KV-cache sizing).
size_t    mtmd_helper_get_n_tokens(const(mtmd_input_chunks)* chunks);

/// Total position count across all chunks (may differ from n_tokens for M-RoPE).
llama_pos mtmd_helper_get_n_pos(const(mtmd_input_chunks)* chunks);

/++
Eval all chunks: text via `llama_decode`, images via `mtmd_encode_chunk` + `llama_decode`.
Returns 0 on success. NOT thread-safe.

Params:
    ctx         = Multimodal context.
    lctx        = Language-model context to decode into.
    chunks      = Chunk list produced by `mtmd_tokenize`.
    n_past      = Starting position.
    seq_id      = Sequence ID to evaluate under.
    n_batch     = Batch size.
    logits_last = Presumably requests logits for the final token — confirm against mtmd-helper.h.
    new_n_past  = Out-parameter receiving the updated position.
+/
int mtmd_helper_eval_chunks(
    mtmd_context*             ctx,
    llama_context*            lctx,
    const(mtmd_input_chunks)* chunks,
    llama_pos                 n_past,
    llama_seq_id              seq_id,
    int                       n_batch,
    bool                      logits_last,
    llama_pos*                new_n_past);

/// Like `mtmd_helper_eval_chunks` but for a single chunk.
int mtmd_helper_eval_chunk_single(
    mtmd_context*            ctx,
    llama_context*           lctx,
    const(mtmd_input_chunk)* chunk,
    llama_pos                n_past,
    llama_seq_id             seq_id,
    int                      n_batch,
    bool                     logits_last,
    llama_pos*               new_n_past);

/// Decode an already-encoded image chunk (embeddings pre-calculated).
/// `encoded_embd` points at the pre-calculated embeddings for `chunk`.
int mtmd_helper_decode_image_chunk(
    mtmd_context*            ctx,
    llama_context*           lctx,
    const(mtmd_input_chunk)* chunk,
    float*                   encoded_embd,
    llama_pos                n_past,
    llama_seq_id             seq_id,
    int                      n_batch,
    llama_pos*               new_n_past);
230 
231 // ── D wrappers ──────────────────────────────────────────────────────────────
// Reset to D linkage so the wrapper types below are not declared extern(C).
// NOTE(review): `extern(D):` changes linkage only; it does not cancel the
// `@nogc nothrow:` label that opened the C API section. Keep that in mind
// before adding module-level functions below this point.
extern(D):
234 
/++
Owning wrapper around a `mtmd_bitmap*` holding image pixels or audio samples.

Construct via `MtmdBitmap.fromRGB`, `MtmdBitmap.fromAudio`, or
`MtmdContext.loadBitmap`. Ownership is handled by the `Owned` mixin, which is
instantiated with `mtmd_bitmap_free` as the release function.

A bitmap wrapping a null handle is the failure value of the factory functions.
The accessors below forward the handle to C without checking it, so test the
bitmap before use.
+/
struct MtmdBitmap
{
    mixin Owned!(mtmd_bitmap, mtmd_bitmap_free);

    /++
    Create from a raw RGB pixel buffer (`RGBRGBRGB…`).

    `data.length` must equal `nx * ny * 3`. The `in` contract reports a
    mismatch in builds that keep contracts. Because the C function receives
    only a pointer and would read past a short buffer, the same check is also
    performed unconditionally, so builds that strip contracts (`-release`)
    get a null-handle bitmap instead of an out-of-bounds read.

    Params:
        nx   = First image dimension, forwarded to `mtmd_bitmap_init`.
        ny   = Second image dimension, forwarded to `mtmd_bitmap_init`.
        data = Pixel bytes; exactly `nx * ny * 3` of them.

    Returns: the new bitmap, or a bitmap wrapping `null` when the length does
    not match or `mtmd_bitmap_init` itself returns `null`.
    +/
    static MtmdBitmap fromRGB(uint nx, uint ny, scope const(ubyte)[] data) @trusted @nogc nothrow
    in (rgbLengthMatches(nx, ny, data.length))
    {
        // Contracts vanish in -release; a @trusted function must still not
        // hand C a buffer shorter than it is going to read.
        if (!rgbLengthMatches(nx, ny, data.length))
            return MtmdBitmap(null);
        return MtmdBitmap(mtmd_bitmap_init(nx, ny, data.ptr));
    }

    /// True when `len == nx * ny * 3`, evaluated without integer overflow:
    /// `nx * ny` always fits in 64 bits, whereas the tripled product may not.
    private static bool rgbLengthMatches(uint nx, uint ny, size_t len) @safe @nogc nothrow pure
    {
        return len % 3 == 0 && len / 3 == cast(ulong) nx * ny;
    }

    /++
    Create from float PCM audio samples.

    NOTE(review): this was previously documented as "mono, any sample rate".
    `MtmdContext.audioSampleRate` exposes a model-specific rate, so the samples
    presumably need to match it — confirm against mtmd.h.
    +/
    static MtmdBitmap fromAudio(scope const(float)[] samples) @trusted @nogc nothrow
    {
        return MtmdBitmap(mtmd_bitmap_init_from_audio(samples.length, samples.ptr));
    }

    /// The bitmap's `nx` value as reported by `mtmd_bitmap_get_nx`.
    @property uint          nx()      const @nogc nothrow { return mtmd_bitmap_get_nx(_ptr); }
    /// The bitmap's `ny` value as reported by `mtmd_bitmap_get_ny`.
    @property uint          ny()      const @nogc nothrow { return mtmd_bitmap_get_ny(_ptr); }
    /// True if the bitmap holds audio rather than image data.
    @property bool          isAudio() const @nogc nothrow { return mtmd_bitmap_is_audio(_ptr); }

    /// Raw pixel/sample bytes (read-only slice into C memory).
    /// The slice borrows from the bitmap and must not be used after it is freed.
    @property const(ubyte)[] data() const @trusted @nogc nothrow
    {
        return mtmd_bitmap_get_data(_ptr)[0 .. mtmd_bitmap_get_n_bytes(_ptr)];
    }

    /// Optional KV-cache tracking ID, as a C string owned by the bitmap.
    const(char)* id()              const @nogc nothrow { return mtmd_bitmap_get_id(_ptr); }
    /// Set the KV-cache tracking ID; `s` must be a NUL-terminated C string.
    void         setId(const(char)* s)   @nogc nothrow { mtmd_bitmap_set_id(_ptr, s); }
}
271 
272 // ────────────────────────────────────────────────────────────────────────────
273 
/++
Owning wrapper for the chunk list that `MtmdContext.tokenize` fills in.
Can be walked with `foreach`, yielding `const(mtmd_input_chunk)*` elements,
optionally together with their index.
+/
struct InputChunks
{
    mixin Owned!(mtmd_input_chunks, mtmd_input_chunks_free);

    /// Allocate a fresh, empty list (to be passed to `MtmdContext.tokenize`).
    static InputChunks create() @nogc nothrow
    {
        auto raw = mtmd_input_chunks_init();
        return InputChunks(raw);
    }

    /// How many chunks the list holds.
    @property size_t length() const @nogc nothrow
    {
        return mtmd_input_chunks_size(_ptr);
    }

    /// Whether the list holds no chunks at all.
    @property bool empty() const @nogc nothrow
    {
        return mtmd_input_chunks_size(_ptr) == 0;
    }

    /// Fetch the chunk stored at position `idx`.
    const(mtmd_input_chunk)* opIndex(size_t idx) const @nogc nothrow
    {
        return mtmd_input_chunks_get(_ptr, idx);
    }

    /// Enables `foreach (chunk; chunks)`; each element is a `const(mtmd_input_chunk)*`.
    int opApply(scope int delegate(const(mtmd_input_chunk)*) dg) const
    {
        // The count is read once up front, like a `0 .. length` range.
        const size_t count = length;
        for (size_t idx = 0; idx < count; ++idx)
        {
            const int rc = dg(mtmd_input_chunks_get(_ptr, idx));
            if (rc != 0)
                return rc; // non-zero means the loop body left early
        }
        return 0;
    }

    /// Enables `foreach (i, chunk; chunks)`, passing the index as well.
    int opApply(scope int delegate(size_t, const(mtmd_input_chunk)*) dg) const
    {
        const size_t count = length;
        for (size_t idx = 0; idx < count; ++idx)
        {
            const int rc = dg(idx, mtmd_input_chunks_get(_ptr, idx));
            if (rc != 0)
                return rc;
        }
        return 0;
    }

    /// Sum of the token counts of every chunk.
    @property size_t nTokens() const @nogc nothrow
    {
        return mtmd_helper_get_n_tokens(_ptr);
    }

    /// Sum of the position counts (may differ from `nTokens` for M-RoPE models).
    @property llama_pos nPos() const @nogc nothrow
    {
        return mtmd_helper_get_n_pos(_ptr);
    }
}
320 
321 // ────────────────────────────────────────────────────────────────────────────
322 
/++
Owning wrapper around a multimodal projector context loaded from a GGUF file.
The projector turns images and audio into embeddings for the paired language
model. Construction can fail, so test the result with `if (ctx)` before use.
+/
struct MtmdContext
{
    mixin Owned!(mtmd_context, mtmd_free);

    /++
    Load a projector from the GGUF file at `mmproj`, paired with `model`.
    Yields a falsy context when `model` is null or when loading fails.
    +/
    static MtmdContext initFromFile(
        string mmproj,
        const(llama_model)* model,
        mtmd_context_params params) @trusted nothrow
    {
        import std.string : toStringz;

        if (model !is null)
        {
            auto handle = mtmd_init_from_file(toStringz(mmproj), model, params);
            return MtmdContext(handle);
        }
        // Without a model the C API is not called at all.
        return MtmdContext(null);
    }

    /// Same as above, using `mtmd_context_params_default()` for the parameters.
    static MtmdContext initFromFile(string mmproj, const(llama_model)* model) nothrow
    {
        return initFromFile(mmproj, model, mtmd_context_params_default());
    }

    /// Whether the model supports image input.
    @property bool supportsVision() @nogc nothrow
    {
        return mtmd_support_vision(_ptr);
    }

    /// Whether the model supports audio input.
    @property bool supportsAudio() @nogc nothrow
    {
        return mtmd_support_audio(_ptr);
    }

    /// Whether the model requires a non-causal attention mask for decoding.
    @property bool useNonCausal() @nogc nothrow
    {
        return mtmd_decode_use_non_causal(_ptr);
    }

    /// Whether the model uses M-RoPE for decoding.
    @property bool useMrope() @nogc nothrow
    {
        return mtmd_decode_use_mrope(_ptr);
    }

    /// Audio sample rate as reported by `mtmd_get_audio_sample_rate`.
    @property int audioSampleRate() @nogc nothrow
    {
        return mtmd_get_audio_sample_rate(_ptr);
    }

    /// Read an image or audio file into an owned bitmap; the bitmap is falsy on failure.
    MtmdBitmap loadBitmap(string path) @trusted nothrow
    {
        import std.string : toStringz;

        const(char)* cPath = toStringz(path);
        auto raw = mtmd_helper_bitmap_init_from_file(_ptr, cPath);
        return MtmdBitmap(raw);
    }

    /// Build an owned bitmap from bytes already held in memory.
    MtmdBitmap loadBitmapFromBuf(const(ubyte)[] buf) @trusted @nogc nothrow
    {
        auto raw = mtmd_helper_bitmap_init_from_buf(_ptr, buf.ptr, buf.length);
        return MtmdBitmap(raw);
    }

    /++
    Split a prompt containing `mtmd_default_marker()` placeholders into chunks.
    `bitmaps` needs one entry per marker found in `text.text`.
    Result codes: 0 success, 1 count mismatch, 2 preprocessing error.
    +/
    int tokenize(
        ref InputChunks           output,
        scope ref mtmd_input_text text,
        const(mtmd_bitmap*)[]     bitmaps = null) @trusted @nogc nothrow
    {
        const size_t bitmapCount = bitmaps.length;
        return mtmd_tokenize(_ptr, output.ptr, &text, bitmaps.ptr, bitmapCount);
    }

    /++
    Run every chunk through the language-model context `lctx`.
    On return `newNPast` holds the position after the last evaluated token.
    Result code 0 means success.
    +/
    int evalChunks(
        llama_context*        lctx,
        ref const InputChunks chunks,
        llama_pos             nPast,
        llama_seq_id          seqId,
        int                   nBatch,
        bool                  logitsLast,
        ref llama_pos         newNPast) @trusted @nogc nothrow
    {
        const int status = mtmd_helper_eval_chunks(
            _ptr,
            lctx,
            chunks.ptr,
            nPast,
            seqId,
            nBatch,
            logitsLast,
            &newNPast);
        return status;
    }

    /// Encode one image chunk; 0 signals success. The resulting embedding
    /// pointer stays valid until the next encode call.
    int encodeChunk(const(mtmd_input_chunk)* chunk) @nogc nothrow
    {
        return mtmd_encode_chunk(_ptr, chunk);
    }

    /// Pointer to the embeddings produced by the most recent encode.
    float* outputEmbd() @nogc nothrow
    {
        return mtmd_get_output_embd(_ptr);
    }
}