/++
    D bindings and wrappers for libmtmd (multimodal support).

    libmtmd encodes images and audio into token embeddings that a language
    model can attend to alongside ordinary text tokens.

    Typical usage:
    ---
    auto mtmd   = MtmdContext.initFromFile("mmproj.gguf", model.ptr);
    auto bmp    = mtmd.loadBitmap("photo.jpg");
    auto chunks = InputChunks.create();
    // `mtmd_input_text.text` is a bare `const(char)*` with no length, so
    // `fullPrompt` presumably has to be NUL-terminated (e.g. via `toStringz`).
    auto txt    = mtmd_input_text(&fullPrompt[0], true, true);
    mtmd.tokenize(chunks, txt, [bmp.ptr]);
    llama_pos nPast;
    mtmd.evalChunks(ctx.ptr, chunks, 0, 0, 512, true, nPast);
    // ... then sample as usual with SamplerChain
    ---
+/
module llama.mtmd;

import llama.llama; // llama_model, llama_context, llama_token, llama_pos, llama_seq_id
import llama.owned;

// ── Callback aliases (matching ggml.h / ggml-backend.h) ────────────────────
/// Logging callback: `level` is `ggml_log_level` cast to int.
alias ggml_log_callback = extern(C) void function(int level, const(char)* text, void* user_data) nothrow;
/// Eval callback: return false to cancel.
alias ggml_backend_sched_eval_callback = extern(C) bool function(void* tensor, bool ask, void* user_data) nothrow;

// ── Opaque C types ──────────────────────────────────────────────────────────
// Declared without a body: D code only ever handles these through pointers
// obtained from, and handed back to, the C API below.
struct mtmd_context;
struct mtmd_bitmap;
struct mtmd_image_tokens;
struct mtmd_input_chunk;
struct mtmd_input_chunks;

// ── Plain C structs ─────────────────────────────────────────────────────────

/// Chunk type emitted by `mtmd_tokenize`.
enum mtmd_input_chunk_type : int
{
    text  = 0, /// Chunk of ordinary text tokens.
    image = 1, /// Chunk of image tokens.
    audio = 2, /// Chunk of audio tokens.
}

/// Text input descriptor passed to `mtmd_tokenize`.
struct mtmd_input_text
{
    const(char)* text;          /// Prompt, may contain `mtmd_default_marker()`.
    bool         add_special;   /// Prepend BOS token if true.
    bool         parse_special; /// Interpret special tokens in the prompt.
}

/++
    Parameters for `mtmd_init_from_file`.
    Always initialise via `mtmd_context_params_default()` and then override fields.

    The field order and types are part of the C ABI (the struct is passed by
    value to `mtmd_init_from_file`), so they must stay in sync with the C header.
+/
struct mtmd_context_params
{
    bool         use_gpu;          /// Run the projector on GPU if available.
    bool         print_timings;    /// Print encoder timing stats on free.
    int          n_threads;        /// Number of CPU threads (0 = auto).
    const(char)* image_marker;     /// Deprecated; use `media_marker`.
    const(char)* media_marker;     /// Marker replaced by image/audio tokens (default: `<__media__>`).
    int          flash_attn_type;  /// `llama_flash_attn_type` cast to int.
    bool         warmup;           /// Run a warm-up encode pass after init.
    int          image_min_tokens; /// Dynamic-resolution lower bound (0 = from metadata).
    int          image_max_tokens; /// Dynamic-resolution upper bound (0 = from metadata).
    ggml_backend_sched_eval_callback cb_eval; /// Optional eval callback.
    void*        cb_eval_user_data; /// User data for `cb_eval`.
}

// ── C API declarations ──────────────────────────────────────────────────────
// Label-style attributes: they apply to every module-level declaration that
// follows, until the end of the module or until overridden.
extern(C) @nogc nothrow:

/// Returns the default media marker string (`"<__media__>"`).
const(char)* mtmd_default_marker();

/// Returns a default-initialised `mtmd_context_params`.
mtmd_context_params mtmd_context_params_default();

/++
    Initialises a multimodal context from a projector GGUF file.
    Returns `null` on failure (bad path, incompatible model, etc.).
+/
mtmd_context* mtmd_init_from_file(
    const(char)*        mmproj_fname,
    const(llama_model)* text_model,
    mtmd_context_params ctx_params);

/// Frees a multimodal context.
void mtmd_free(mtmd_context* ctx);

/// True if the model requires a non-causal attention mask for llama_decode.
bool mtmd_decode_use_non_causal(mtmd_context* ctx);

/// True if the model uses M-RoPE (Multimodal RoPE) for llama_decode.
bool mtmd_decode_use_mrope(mtmd_context* ctx);

/// True if the model supports image input.
bool mtmd_support_vision(mtmd_context* ctx);

/// True if the model supports audio input.
bool mtmd_support_audio(mtmd_context* ctx);

/// Audio sample rate in Hz (e.g. 16 000 for Whisper), or -1 if unsupported.
int mtmd_get_audio_sample_rate(mtmd_context* ctx);

// ── mtmd_bitmap ─────────────────────────────────────────────────────────────

/// Create an image bitmap from raw RGB pixels (`RGBRGBRGB…`; length must equal `nx * ny * 3`).
mtmd_bitmap* mtmd_bitmap_init(uint nx, uint ny, const(ubyte)* data);

/// Create an audio bitmap from float PCM samples.
mtmd_bitmap* mtmd_bitmap_init_from_audio(size_t n_samples, const(float)* data);

/// The bitmap's `nx` dimension (the value passed as `nx` to `mtmd_bitmap_init`
/// for images; meaning for audio bitmaps is defined by libmtmd — TODO confirm).
uint          mtmd_bitmap_get_nx     (const(mtmd_bitmap)* bitmap);
/// The bitmap's `ny` dimension (see `mtmd_bitmap_get_nx`).
uint          mtmd_bitmap_get_ny     (const(mtmd_bitmap)* bitmap);
/// Pointer to the bitmap's raw bytes; pair with `mtmd_bitmap_get_n_bytes` for the length.
const(ubyte)* mtmd_bitmap_get_data   (const(mtmd_bitmap)* bitmap);
/// Number of bytes addressable through `mtmd_bitmap_get_data`.
size_t        mtmd_bitmap_get_n_bytes(const(mtmd_bitmap)* bitmap);
/// True if the bitmap holds audio rather than image data.
bool          mtmd_bitmap_is_audio   (const(mtmd_bitmap)* bitmap);
/// Frees a bitmap created by one of the `mtmd_bitmap_init*` / helper loaders.
void          mtmd_bitmap_free       (mtmd_bitmap* bitmap);

/// Optional string ID used for KV-cache tracking.
const(char)* mtmd_bitmap_get_id(const(mtmd_bitmap)* bitmap);
/// Set the bitmap ID.
void mtmd_bitmap_set_id(mtmd_bitmap* bitmap, const(char)* id);

// ── mtmd_input_chunks ───────────────────────────────────────────────────────

/// Allocate an empty chunk list, to be filled by `mtmd_tokenize`.
mtmd_input_chunks*       mtmd_input_chunks_init();
/// Number of chunks currently in the list.
size_t                   mtmd_input_chunks_size(const(mtmd_input_chunks)* chunks);
/// Chunk at position `idx`. NOTE(review): behaviour for an out-of-range `idx`
/// is not visible from these bindings — confirm against the C header.
const(mtmd_input_chunk)* mtmd_input_chunks_get (const(mtmd_input_chunks)* chunks, size_t idx);
/// Frees a chunk list obtained from `mtmd_input_chunks_init`.
void                     mtmd_input_chunks_free(mtmd_input_chunks* chunks);

// ── mtmd_input_chunk ────────────────────────────────────────────────────────

/// Kind of content held by the chunk (text, image or audio).
mtmd_input_chunk_type     mtmd_input_chunk_get_type        (const(mtmd_input_chunk)* chunk);
/// Token array of a text chunk; the element count is written to `*n_tokens_output`.
/// NOTE(review): result for non-text chunks is presumably `null` — confirm.
const(llama_token)*       mtmd_input_chunk_get_tokens_text (const(mtmd_input_chunk)* chunk, size_t* n_tokens_output);
/// Image-token descriptor of the chunk, usable with the `mtmd_image_tokens_*` getters.
const(mtmd_image_tokens)* mtmd_input_chunk_get_tokens_image(const(mtmd_input_chunk)* chunk);
/// Number of tokens in the chunk.
size_t                    mtmd_input_chunk_get_n_tokens    (const(mtmd_input_chunk)* chunk);
/// String ID attached to the chunk (presumably the originating bitmap's ID — confirm).
const(char)*              mtmd_input_chunk_get_id          (const(mtmd_input_chunk)* chunk);
/// Number of positions the chunk occupies (see `mtmd_helper_get_n_pos`).
llama_pos                 mtmd_input_chunk_get_n_pos       (const(mtmd_input_chunk)* chunk);
/// Returns a mutable copy of the chunk.
/// NOTE(review): presumably owned by the caller and released with
/// `mtmd_input_chunk_free` — confirm against the C header.
mtmd_input_chunk*         mtmd_input_chunk_copy            (const(mtmd_input_chunk)* chunk);
/// Frees a chunk returned by `mtmd_input_chunk_copy`.
void                      mtmd_input_chunk_free            (mtmd_input_chunk* chunk);

// ── mtmd_image_tokens ───────────────────────────────────────────────────────

/// Number of tokens described by `image_tokens`.
size_t       mtmd_image_tokens_get_n_tokens(const(mtmd_image_tokens)* image_tokens);
/// `nx` extent of the token grid (units defined by libmtmd — TODO confirm).
size_t       mtmd_image_tokens_get_nx      (const(mtmd_image_tokens)* image_tokens);
/// `ny` extent of the token grid (units defined by libmtmd — TODO confirm).
size_t       mtmd_image_tokens_get_ny      (const(mtmd_image_tokens)* image_tokens);
/// String ID attached to the image tokens.
const(char)* mtmd_image_tokens_get_id      (const(mtmd_image_tokens)* image_tokens);
/// Number of positions the image tokens occupy.
llama_pos    mtmd_image_tokens_get_n_pos   (const(mtmd_image_tokens)* image_tokens);

// ── Tokenise / encode ───────────────────────────────────────────────────────

/++
    Tokenise a text prompt that may contain media markers.
    Returns 0 on success, 1 on bitmap-count mismatch, 2 on preprocessing error.

    Params:
        ctx       = multimodal context from `mtmd_init_from_file`
        output    = chunk list (from `mtmd_input_chunks_init`) that receives the result
        text      = prompt descriptor
        bitmaps   = array of `n_bitmaps` bitmap pointers, one per marker in the prompt
        n_bitmaps = number of entries in `bitmaps`
+/
int mtmd_tokenize(
    mtmd_context*           ctx,
    mtmd_input_chunks*      output,
    const(mtmd_input_text)* text,
    const(mtmd_bitmap*)*    bitmaps,
    size_t                  n_bitmaps);

/// Encode a single image/audio chunk. Returns 0 on success.
int mtmd_encode_chunk(mtmd_context* ctx, const(mtmd_input_chunk)* chunk);

/// Pointer to the float embeddings from the last `mtmd_encode_chunk` call.
float* mtmd_get_output_embd(mtmd_context* ctx);

/// Set a logging callback.
void mtmd_log_set(ggml_log_callback log_callback, void* user_data);

// ── mtmd-helper API ─────────────────────────────────────────────────────────

/// Set logging callback (also calls `mtmd_log_set` internally).
void mtmd_helper_log_set(ggml_log_callback log_callback, void* user_data);

/// Load an image or audio file into a bitmap. Thread-safe. Returns `null` on failure.
mtmd_bitmap* mtmd_helper_bitmap_init_from_file(mtmd_context* ctx, const(char)* fname);

/// Load from an in-memory buffer (JPEG/PNG/BMP/GIF/WAV/MP3/FLAC). Thread-safe.
mtmd_bitmap* mtmd_helper_bitmap_init_from_buf(mtmd_context* ctx, const(ubyte)* buf, size_t len);

/// Total token count across all chunks (for KV-cache sizing).
size_t mtmd_helper_get_n_tokens(const(mtmd_input_chunks)* chunks);

/// Total position count across all chunks (may differ from n_tokens for M-RoPE).
llama_pos mtmd_helper_get_n_pos(const(mtmd_input_chunks)* chunks);

/++
    Eval all chunks: text via `llama_decode`, images via `mtmd_encode_chunk` + `llama_decode`.
    Returns 0 on success. NOT thread-safe.

    Params:
        ctx         = multimodal context
        lctx        = language-model context the chunks are decoded into
        chunks      = chunk list produced by `mtmd_tokenize`
        n_past      = position at which evaluation starts
        seq_id      = sequence the tokens are assigned to
        n_batch     = batch size used for decoding
        logits_last = request logits for the last token
        new_n_past  = receives the position after the last evaluated token
+/
int mtmd_helper_eval_chunks(
    mtmd_context*             ctx,
    llama_context*            lctx,
    const(mtmd_input_chunks)* chunks,
    llama_pos                 n_past,
    llama_seq_id              seq_id,
    int                       n_batch,
    bool                      logits_last,
    llama_pos*                new_n_past);

/// Like `mtmd_helper_eval_chunks` but for a single chunk.
int mtmd_helper_eval_chunk_single(
    mtmd_context*            ctx,
    llama_context*           lctx,
    const(mtmd_input_chunk)* chunk,
    llama_pos                n_past,
    llama_seq_id             seq_id,
    int                      n_batch,
    bool                     logits_last,
    llama_pos*               new_n_past);

/// Decode an already-encoded image chunk (embeddings pre-calculated).
int mtmd_helper_decode_image_chunk(
    mtmd_context*            ctx,
    llama_context*           lctx,
    const(mtmd_input_chunk)* chunk,
    float*                   encoded_embd,
    llama_pos                n_past,
    llama_seq_id             seq_id,
    int                      n_batch,
    llama_pos*               new_n_past);

// ── D wrappers ──────────────────────────────────────────────────────────────
// Reset to D linkage — the extern(C) linkage above must not bleed through.
// Only the linkage is reset here: the `@nogc nothrow` labels that accompany
// the `extern(C)` line are not undone by this statement, so any free function
// added at module level below still inherits them. The struct members below
// spell out their own attributes explicitly.
extern(D):

/++
    An image or audio bitmap loaded from a file or raw buffer.
    Construct via `MtmdBitmap.fromRGB`, `MtmdBitmap.fromAudio`, or
    `MtmdContext.loadBitmap`.

    Ownership of the underlying `mtmd_bitmap*` is handled by the `Owned` mixin
    (released with `mtmd_bitmap_free`).
+/
struct MtmdBitmap
{
    mixin Owned!(mtmd_bitmap, mtmd_bitmap_free);

    /// True when `len == nx * ny * 3`, computed in 64 bits without wrap-around.
    private static bool rgbLengthMatches(uint nx, uint ny, size_t len) @safe pure @nogc nothrow
    {
        // uint * uint always fits in ulong; only the final "* 3" can overflow.
        const ulong pixels = cast(ulong) nx * ny;
        return pixels <= ulong.max / 3 && pixels * 3 == len;
    }

    /++
        Create from a raw RGB pixel buffer (`RGBRGBRGB…`).
        `data.length` must equal `nx * ny * 3`.

        The `in` contract reports a mismatch in checked builds. Because
        contracts are compiled out with `-release`, the length is validated
        again at run time: handing a short buffer to C would make it read past
        the end of `data`, which this `@trusted` function must never allow.

        Returns: the new bitmap, or a falsy (null) bitmap when the length does
        not match (release builds) or when `mtmd_bitmap_init` itself fails.
    +/
    static MtmdBitmap fromRGB(uint nx, uint ny, scope const(ubyte)[] data) @trusted @nogc nothrow
    in (rgbLengthMatches(nx, ny, data.length))
    {
        if (!rgbLengthMatches(nx, ny, data.length))
            return MtmdBitmap(null);
        return MtmdBitmap(mtmd_bitmap_init(nx, ny, data.ptr));
    }

    /++
        Create from float PCM audio samples.

        Only the sample buffer and its length are forwarded to
        `mtmd_bitmap_init_from_audio`; no sample-rate or channel-count
        information is passed.
        NOTE(review): the original comment here read "mono, any sample rate".
        Since the rate cannot be communicated through this call, confirm
        against libmtmd whether samples must already be at
        `MtmdContext.audioSampleRate`.
    +/
    static MtmdBitmap fromAudio(scope const(float)[] samples) @trusted @nogc nothrow
    {
        return MtmdBitmap(mtmd_bitmap_init_from_audio(samples.length, samples.ptr));
    }

    /// The bitmap's `nx` dimension as reported by `mtmd_bitmap_get_nx`.
    @property uint nx()      const @nogc nothrow { return mtmd_bitmap_get_nx(_ptr); }
    /// The bitmap's `ny` dimension as reported by `mtmd_bitmap_get_ny`.
    @property uint ny()      const @nogc nothrow { return mtmd_bitmap_get_ny(_ptr); }
    /// True if the bitmap holds audio rather than image data.
    @property bool isAudio() const @nogc nothrow { return mtmd_bitmap_is_audio(_ptr); }

    /++
        Raw pixel/sample bytes (read-only slice into C memory).

        Returns an empty slice for a falsy bitmap (e.g. after a failed
        `MtmdContext.loadBitmap`) or when the C side reports no data, instead
        of slicing a null pointer.
    +/
    @property const(ubyte)[] data() const @trusted @nogc nothrow
    {
        if (_ptr is null)
            return null;
        auto bytes = mtmd_bitmap_get_data(_ptr);
        if (bytes is null)
            return null;
        return bytes[0 .. mtmd_bitmap_get_n_bytes(_ptr)];
    }

    /// Optional KV-cache tracking ID.
    const(char)* id() const @nogc nothrow { return mtmd_bitmap_get_id(_ptr); }
    /// Set the KV-cache tracking ID; `s` is passed to `mtmd_bitmap_set_id` unchanged.
    void setId(const(char)* s) @nogc nothrow { mtmd_bitmap_set_id(_ptr, s); }
}

// ────────────────────────────────────────────────────────────────────────────

/++
    A list of tokenized input chunks produced by `MtmdContext.tokenize`.
    Supports `foreach` iteration over `const(mtmd_input_chunk)*` elements.

    Ownership of the underlying `mtmd_input_chunks*` is handled by the `Owned`
    mixin (released with `mtmd_input_chunks_free`).
+/
struct InputChunks
{
    mixin Owned!(mtmd_input_chunks, mtmd_input_chunks_free);

    /// Create an empty chunk list (to be filled by `MtmdContext.tokenize`).
    static InputChunks create() @nogc nothrow
    {
        return InputChunks(mtmd_input_chunks_init());
    }

    /// Number of chunks.
    @property size_t length() const @nogc nothrow { return mtmd_input_chunks_size(_ptr); }
    /// True when no chunks are present.
    @property bool   empty()  const @nogc nothrow { return length == 0; }

    /// Index into the chunk list. `idx` is forwarded to `mtmd_input_chunks_get`
    /// without a bounds check on the D side.
    const(mtmd_input_chunk)* opIndex(size_t idx) const @nogc nothrow
    {
        return mtmd_input_chunks_get(_ptr, idx);
    }

    /// `foreach (chunk; chunks)` — iterates each `const(mtmd_input_chunk)*`.
    /// A non-zero result from `dg` (e.g. `break`) stops iteration and is returned.
    int opApply(scope int delegate(const(mtmd_input_chunk)*) dg) const
    {
        foreach (i; 0 .. length)
            if (auto r = dg(this[i])) return r;
        return 0;
    }

    /// `foreach (i, chunk; chunks)` — indexed iteration.
    /// A non-zero result from `dg` (e.g. `break`) stops iteration and is returned.
    int opApply(scope int delegate(size_t, const(mtmd_input_chunk)*) dg) const
    {
        foreach (i; 0 .. length)
            if (auto r = dg(i, this[i])) return r;
        return 0;
    }

    /// Total token count across all chunks.
    @property size_t    nTokens() const @nogc nothrow { return mtmd_helper_get_n_tokens(_ptr); }
    /// Total position count (may differ from `nTokens` for M-RoPE models).
    @property llama_pos nPos()    const @nogc nothrow { return mtmd_helper_get_n_pos(_ptr); }
}

// ────────────────────────────────────────────────────────────────────────────

/++
    A multimodal projector context loaded from a GGUF file.
    Encodes images and audio into embeddings for the paired language model.
    Check `if (ctx)` after construction: the members below pass the stored
    pointer straight to the C API without a null check.
+/
struct MtmdContext
{
    mixin Owned!(mtmd_context, mtmd_free);

    /++
        Load a projector from a GGUF file.

        Params:
            mmproj = path to the projector file (copied to a NUL-terminated
                     string with `toStringz`, which may allocate)
            model  = text model the projector is paired with
            params = context parameters, normally derived from
                     `mtmd_context_params_default()`

        Returns: a falsy context on failure or when `model` is null.
    +/
    static MtmdContext initFromFile(
        string              mmproj,
        const(llama_model)* model,
        mtmd_context_params params) @trusted nothrow
    {
        // Short-circuit on the D side: a null model is never forwarded to C.
        if (model is null) return MtmdContext(null);
        import std.string : toStringz;
        return MtmdContext(mtmd_init_from_file(mmproj.toStringz, model, params));
    }

    /// Overload using default params (`mtmd_context_params_default()`).
    static MtmdContext initFromFile(string mmproj, const(llama_model)* model) nothrow
    {
        auto p = mtmd_context_params_default();
        return initFromFile(mmproj, model, p);
    }

    /// True if the model supports image input.
    @property bool supportsVision() @nogc nothrow { return mtmd_support_vision(_ptr); }
    /// True if the model supports audio input.
    @property bool supportsAudio()  @nogc nothrow { return mtmd_support_audio(_ptr); }
    /// True if the model requires a non-causal attention mask for `llama_decode`.
    @property bool useNonCausal()   @nogc nothrow { return mtmd_decode_use_non_causal(_ptr); }
    /// True if the model uses M-RoPE for `llama_decode`.
    @property bool useMrope()       @nogc nothrow { return mtmd_decode_use_mrope(_ptr); }
    /// Audio sample rate in Hz, or -1 if audio is unsupported.
    @property int  audioSampleRate() @nogc nothrow { return mtmd_get_audio_sample_rate(_ptr); }

    /// Load an image or audio file into an owned bitmap. Returns falsy bitmap on failure.
    /// `path` is copied to a NUL-terminated string with `toStringz`, which may allocate.
    MtmdBitmap loadBitmap(string path) @trusted nothrow
    {
        import std.string : toStringz;
        return MtmdBitmap(mtmd_helper_bitmap_init_from_file(_ptr, path.toStringz));
    }

    /// Load a bitmap from an in-memory byte buffer.
    /// The result wraps whatever `mtmd_helper_bitmap_init_from_buf` returns;
    /// test it with `if (bmp)` before use.
    MtmdBitmap loadBitmapFromBuf(const(ubyte)[] buf) @trusted @nogc nothrow
    {
        return MtmdBitmap(mtmd_helper_bitmap_init_from_buf(_ptr, buf.ptr, buf.length));
    }

    /++
        Tokenise a prompt string that contains `mtmd_default_marker()` placeholders.
        `bitmaps` must have exactly as many entries as markers in `text.text`.
        Returns 0 on success, 1 on count mismatch, 2 on preprocessing error.

        Params:
            output  = chunk list (see `InputChunks.create`) that receives the result
            text    = prompt descriptor; its address is passed to C for the
                      duration of the call only
            bitmaps = raw bitmap pointers (e.g. `[bmp.ptr]`), one per marker
    +/
    int tokenize(
        ref InputChunks       output,
        scope ref mtmd_input_text text,
        const(mtmd_bitmap*)[] bitmaps = null) @trusted @nogc nothrow
    {
        return mtmd_tokenize(_ptr, output.ptr, &text, bitmaps.ptr, bitmaps.length);
    }

    /++
        Evaluate all chunks against the language-model context.
        Advances `newNPast` to the position after the last evaluated token.
        Returns 0 on success.

        Params:
            lctx       = language-model context to decode into
            chunks     = chunk list produced by `tokenize`
            nPast      = position at which evaluation starts
            seqId      = sequence the tokens are assigned to
            nBatch     = batch size used for decoding
            logitsLast = request logits for the last token
            newNPast   = receives the position after the last evaluated token
    +/
    int evalChunks(
        llama_context*        lctx,
        ref const InputChunks chunks,
        llama_pos             nPast,
        llama_seq_id          seqId,
        int                   nBatch,
        bool                  logitsLast,
        ref llama_pos         newNPast) @trusted @nogc nothrow
    {
        return mtmd_helper_eval_chunks(
            _ptr, lctx, chunks.ptr,
            nPast, seqId, nBatch, logitsLast, &newNPast);
    }

    /// Encode a single image chunk. Returns 0 on success; the embedding pointer
    /// is valid until the next encode call.
    int encodeChunk(const(mtmd_input_chunk)* chunk) @nogc nothrow
    {
        return mtmd_encode_chunk(_ptr, chunk);
    }

    /// Pointer to the most recently encoded embeddings.
    float* outputEmbd() @nogc nothrow { return mtmd_get_output_embd(_ptr); }
}