MoDA (Experimental)
An alternative depth-aware attention + DeepSeek-MoE architecture.
Experimental architecture
Experimental · alternative track. MoDA (Mixture-of-Depths Attention) is a self-contained, experimental alternative model defined in open_mythos/moda.py. It is not the looped Recurrent Depth Transformer that defines the core OpenMythos brand — it does not use the Prelude / Recurrent / Coda loop, ACT halting, or the stable LTI injection. Instead it explores two orthogonal ideas: depth-aware attention with a per-layer depth KV cache, and a DeepSeek-style MoE FFN. Treat everything below as an illustrative simulation of that file.
How a MoDA block is built
Depth-aware attention
DeepSeek mixture-of-experts FFN
Implementation (authoritative)
open_mythos/moda.py · lines 671-821 · MoDAAttention
Depth-aware unified attention (experimental)
class MoDAAttention(nn.Module):
    """Mixture-of-Depths Attention — read side.

    Each query jointly attends (single softmax) to:

    * Sequence KVs at the current layer (causal GQA).
    * Depth KVs from all preceding layers at the *same* token position.

    Depth cache entries are written externally by :class:`MoDABlock` from
    the full block output X_l^out (after the MoE FFN).

    Args:
        cfg: :class:`MoDAConfig` instance.
    """

    def __init__(self, cfg: MoDAConfig) -> None:
        """Build the MoDA attention module.

        Creates four bias-free projection matrices (Q, K, V, O) sized for
        GQA and stores the attention scale and dropout rate.

        Args:
            cfg: Model configuration. Must satisfy
                ``n_heads_q % n_heads_kv == 0`` (GQA requirement).

        Raises:
            ValueError: If ``n_heads_q`` is not divisible by ``n_heads_kv``.
        """
        super().__init__()
        if cfg.n_heads_q % cfg.n_heads_kv != 0:
            raise ValueError(
                f"n_heads_q ({cfg.n_heads_q}) must be divisible by "
                f"n_heads_kv ({cfg.n_heads_kv}) for GQA."
            )

        self.n_heads_q = cfg.n_heads_q
        self.n_heads_kv = cfg.n_heads_kv
        self.head_dim = cfg.head_dim
        # Number of query heads that share a single KV head.
        self.gqa_group = cfg.n_heads_q // cfg.n_heads_kv
        # Logit scaling factor: 1 / sqrt(head_dim).
        self.scale = cfg.head_dim**-0.5
        self.dropout = cfg.attn_dropout

        inner_q = cfg.n_heads_q * cfg.head_dim
        inner_kv = cfg.n_heads_kv * cfg.head_dim
        self.q_proj = nn.Linear(cfg.d_model, inner_q, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, inner_kv, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, inner_kv, bias=False)
        self.o_proj = nn.Linear(inner_q, cfg.d_model, bias=False)

    def _expand_kv(self, kv: torch.Tensor) -> torch.Tensor:
        """Repeat KV heads along dim 1 to match the number of query heads.

        With GQA group size G, each KV head is shared by G query heads.
        ``repeat_interleave(G, dim=1)`` produces the correct interleaved
        expansion so that query head ``h`` is paired with KV head ``h // G``.

        Args:
            kv: Key or value tensor whose dim 1 is the KV-head axis.
                Supported shapes: ``[B, Hk, T, d]`` (sequence) and
                ``[B, Hk, T, L, d]`` (depth stack).

        Returns:
            Tensor with dim 1 expanded from ``Hk`` to ``Hq = Hk × G``.
            Returns *kv* unchanged when ``gqa_group == 1``.
        """
        if self.gqa_group == 1:
            return kv
        return kv.repeat_interleave(self.gqa_group, dim=1)

    def forward(
        self,
        x: torch.Tensor,
        depth_k_cache: List[torch.Tensor],
        depth_v_cache: List[torch.Tensor],
        cos: torch.Tensor,
        sin: torch.Tensor,
    ) -> torch.Tensor:
        """Compute MoDA attention output.

        With an empty depth cache this is plain causal GQA attention via
        ``F.scaled_dot_product_attention``. Otherwise sequence logits and
        depth logits are concatenated and normalised by one softmax.

        Args:
            x: ``[B, T, D]`` input hidden states.
            depth_k_cache: ``L`` tensors each ``[B, Hk, T, d]`` — depth keys.
            depth_v_cache: Matching depth values (same length and shapes).
            cos/sin: RoPE tables ``[1, 1, T, d]``.

        Returns:
            ``[B, T, D]`` output hidden states.

        Raises:
            ValueError: If ``depth_k_cache`` and ``depth_v_cache`` do not
                contain the same number of entries.
        """
        n_depth = len(depth_k_cache)
        # Fail early with a clear message: an unpaired cache would otherwise
        # surface as an opaque shape error, or (empty keys, non-empty
        # values) be silently ignored.
        if len(depth_v_cache) != n_depth:
            raise ValueError(
                f"depth_k_cache ({n_depth}) and depth_v_cache "
                f"({len(depth_v_cache)}) must have the same length."
            )

        B, T, _ = x.shape
        Hq, Hk, d = self.n_heads_q, self.n_heads_kv, self.head_dim

        # Project and split heads: [B, T, H*d] -> [B, H, T, d].
        Q = self.q_proj(x).view(B, T, Hq, d).transpose(1, 2)
        K = self.k_proj(x).view(B, T, Hk, d).transpose(1, 2)
        V = self.v_proj(x).view(B, T, Hk, d).transpose(1, 2)

        # Rotary embedding is applied to the sequence queries and keys only;
        # cached depth keys are used as stored.
        Q = apply_rotary_emb(Q, cos, sin)
        K = apply_rotary_emb(K, cos, sin)

        K_e = self._expand_kv(K)
        V_e = self._expand_kv(V)

        if n_depth == 0:
            # No depth entries yet: ordinary causal attention.
            out = F.scaled_dot_product_attention(
                Q,
                K_e,
                V_e,
                is_causal=True,
                dropout_p=self.dropout if self.training else 0.0,
                scale=self.scale,
            )
        else:
            # Sequence logits [B, Hq, T, T]; -inf strictly above the diagonal
            # prevents position i from attending to any j > i.
            seq_logits = torch.matmul(Q, K_e.transpose(-2, -1)) * self.scale
            causal_mask = torch.triu(
                torch.full((T, T), float("-inf"), device=x.device, dtype=Q.dtype),
                diagonal=1,
            )
            seq_logits = seq_logits + causal_mask

            # Depth KVs stacked on a new axis after T: [B, Hk, T, L, d].
            K_depth = torch.stack(depth_k_cache, dim=3)
            V_depth = torch.stack(depth_v_cache, dim=3)
            K_depth_e = self._expand_kv(K_depth)
            V_depth_e = self._expand_kv(V_depth)

            # Depth logits [B, Hq, T, L]: each query position i is scored
            # only against the depth keys stored at that same position i.
            depth_logits = (
                torch.einsum("bhid,bhild->bhil", Q, K_depth_e) * self.scale
            )

            # Unified softmax over T + L positions.
            combined = torch.cat([seq_logits, depth_logits], dim=-1)
            weights = F.softmax(combined, dim=-1)
            if self.training and self.dropout > 0.0:
                weights = F.dropout(weights, p=self.dropout)

            # Split the joint weights back into sequence and depth parts and
            # mix each with its own values.
            seq_contrib = torch.matmul(weights[:, :, :, :T], V_e)
            depth_contrib = torch.einsum(
                "bhil,bhild->bhid", weights[:, :, :, T:], V_depth_e
            )
            out = seq_contrib + depth_contrib

        # Merge heads: [B, Hq, T, d] -> [B, T, Hq*d], then project to D.
        out = out.transpose(1, 2).reshape(B, T, Hq * d)
        return self.o_proj(out)
# ---------------------------------------------------------------------------
# MoDA Transformer Block
# ---------------------------------------------------------------------------
The unified softmax lives in the L > 0 branch: sequence and depth logits are concatenated, then a single F.softmax normalises both.
All diagrams above are seeded, illustrative simulations. The code panels are the source of truth — see open_mythos/moda.py.