ml-gde
diff --git a/jflux/cli.py
+50-83 b/jflux/cli.py
+50-83
diff --git a/jflux/math.py
+1-2 b/jflux/math.py
+1-2
diff --git a/jflux/model.py
+26-9 b/jflux/model.py
+26-9
diff --git a/jflux/modules/autoencoder.py
+15-2 b/jflux/modules/autoencoder.py
+15-2
diff --git a/jflux/modules/conditioner.py
+17-16 b/jflux/modules/conditioner.py
+17-16
@@ -6,9 +6,13 @@
 
 import jax
 import jax.numpy as jnp
+from flax import nnx
 from fire import Fire
 from jax.typing import DTypeLike
 
+from PIL import Image
+
+from einops import rearrange
 from jflux.sampling import denoise, get_noise, get_schedule, prepare, unpack
 from jflux.util import configs, load_ae, load_clip, load_flow_model, load_t5
 
@@ -101,51 +105,35 @@ def main(
         "a photo of a forest with mist swirling around the tree trunks. The word "
         '"FLUX" is painted over it in big, red brush strokes with visible texture'
     ),
-    device: str = "gpu" if jax.device_get("gpu") else "cpu",
     num_steps: int | None = None,
     loop: bool = False,
     guidance: float = 3.5,
-    # TODO: JAX variant of offloading to CPU
     offload: bool = False,
     output_dir: str = "output",
-    dtype: DTypeLike = jax.dtypes.bfloat16,
-    param_dtype: DTypeLike = None,
-) -> None:
+    add_sampling_metadata: bool = True,
+):
     """
-    Sample the flux model.
+    Sample the flux model. Either interactively (set `--loop`) or run for a
+    single image.
 
     Args:
-        name(str): Name of the model to use. Choose from 'flux-schnell' or 'flux-dev'.
-        width(int): Width of the generated image.
-        height(int): Height of the generated image.
-        seed(int, optional): Seed for the random number generator.
-        prompt(str): Text prompt to generate the image from.
-        device(str): Device to run the model on. Choose from 'cpu' or 'gpu'.
-        num_steps(int, optional): Number of steps to run the model for.
-        loop(bool): Whether to loop the sampling process.
-        guidance(float, optional): Guidance for the model, defaults to 3.5.
-        offload(bool, optional): Whether to offload the model to CPU, defaults to False.
-        output_dir(str, optional): Directory to save the output images in, defaults to 'output'.
-        dtype(DTypeLike, optional): Data type for the model, defaults to jax.dtypes.bfloat16.
-        param_dtype(DTypeLike, optional): Data type for the model parameters, defaults to None.
+        name: Name of the model to load
+        height: height of the sample in pixels (should be a multiple of 16)
+        width: width of the sample in pixels (should be a multiple of 16)
+        seed: Set a seed for sampling
+        output_name: where to save the output image, `{idx}` will be replaced
+            by the index of the sample
+        prompt: Prompt used for sampling
+        output_dir: directory the output images are saved in (default "output")
+        num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
+        loop: start an interactive session and sample multiple times
+        guidance: guidance value used for guidance distillation
+        add_sampling_metadata: Add the prompt to the image Exif metadata
     """
-
-    if param_dtype is None:
-        param_dtype = dtype
-
     if name not in configs:
         available = ", ".join(configs.keys())
         raise ValueError(f"Got unknown model name: {name}, chose from {available}")
 
-    jax_device = jax.devices(device)
-    if len(jax_device) == 1:
-        jax_device = jax_device[0]
-    else:
-        # TODO (ariG23498)
-        # this will be when there are more than
-        # one devices to work on
-        pass
-
     if num_steps is None:
         num_steps = 4 if name == "flux-schnell" else 50
 
@@ -169,26 +157,11 @@ def main(
             idx = 0
 
     # init all components
-    import sys
-
-    sys.exit(0)
-    t5 = load_t5(max_length=256 if name == "flux-schnell" else 512)
+    t5 = load_t5()
     clip = load_clip()
-    model = load_flow_model(
-        name,
-        device="cpu" if offload else jax_device,
-        dtype=dtype,
-        param_dtype=param_dtype,
-    )
-    ae = load_ae(
-        name,
-        device="cpu" if offload else jax_device,
-        dtype=dtype,
-        param_dtype=param_dtype,
-    )
+    model = load_flow_model(name)
+    ae = load_ae(name)
 
-    # TODO (ariG23498)
-    # rngs = nnx.Rngs(0)
     opts = SamplingOptions(
         prompt=prompt,
         width=width,
@@ -200,57 +173,51 @@ def main(
 
     while opts is not None:
         if opts.seed is None:
-            # TODO (ariG23498)
-            # set the rng seed
-            # opts.seed = rng.seed()
-            pass
+            opts.seed = jax.random.PRNGKey(seed=42)
         print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
         t0 = time.perf_counter()
 
         # prepare input
         x = get_noise(
-            1,
-            opts.height,
-            opts.width,
-            device=jax_device,
+            num_samples=1,
+            height=opts.height,
+            width=opts.width,
             dtype=jax.dtypes.bfloat16,
-            seed=opts.seed,  # type: ignore
+            seed=opts.seed,
         )
         opts.seed = None
-        # TODO: JAX variant of offloading to CPU
-        # if offload:
-        #     ae = ae.cpu()
-        #     torch.cuda.empty_cache()
-        #     t5, clip = t5.to(torch_device), clip.to(torch_device)
-        inp = prepare(t5, clip, img=x, prompt=opts.prompt, device=jax_device)
+
+        inp = prepare(t5=t5, clip=clip, img=x, prompt=opts.prompt)
         timesteps = get_schedule(
-            opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell")
+            num_steps=opts.num_steps,
+            image_seq_len=inp["img"].shape[1],
+            shift=(name != "flux-schnell"),
         )
 
-        # offload TEs to CPU, load model to gpu
-        # TODO: JAX variant of offloading to CPU
-        # if offload:
-        #     t5, clip = t5.cpu(), clip.cpu()
-        #     torch.cuda.empty_cache()
-        #     model = model.to(torch_device)
-
         # denoise initial noise
-        x = denoise(model, **inp, timesteps=timesteps, guidance=opts.guidance)
-
-        # offload model, load autoencoder to gpu
-        # TODO: JAX variant of offloading to CPU
-        # if offload:
-        #     model.cpu()
-        #     torch.cuda.empty_cache()
-        #     ae.decoder.to(x.device)
+        x = denoise(
+            model=model,
+            img=inp["img"],
+            img_ids=inp["img_ids"],
+            txt=inp["txt"],
+            txt_ids=inp["txt_ids"],
+            vec=inp["vec"],
+            timesteps=timesteps,
+            guidance=opts.guidance,
+        )
 
         # decode latents to pixel space
-        x = unpack(x.astype(jnp.float32), opts.height, opts.width)
-        x = ae.decode(x).astype(dtype=jax.dtypes.bfloat16)  # noqa
+        x = unpack(x=x.astype(jnp.float32), height=opts.height, width=opts.width)
+        x = ae.decode(x)
         t1 = time.perf_counter()
 
         fn = output_name.format(idx=idx)
         print(f"Done in {t1 - t0:.1f}s. Saving {fn}")
+        # bring into PIL format and save
+        x = x.clip(-1, 1)
+        x = rearrange(x[0], "c h w -> h w c")
+
+        img = Image.fromarray((127.5 * (x + 1.0)))
 
         if loop:
             print("-" * 80)
 
@@ -1,4 +1,3 @@
-import jax
 from chex import Array
 from einops import rearrange
 from flax import nnx
@@ -16,7 +15,7 @@ def attention(q: Array, k: Array, v: Array, pe: Array) -> Array:
 
 def rope(pos: Array, dim: int, theta: int) -> Array:
     assert dim % 2 == 0
-    scale = jnp.arange(0, dim, 2, dtype=jnp.float64, device=pos.device) / dim
+    scale = jnp.arange(0, dim, 2, dtype=jnp.float32) / dim
     omega = 1.0 / (theta**scale)
     out = jnp.einsum("...n,d->...nd", pos, omega)
     out = jnp.stack([jnp.cos(out), -jnp.sin(out), jnp.sin(out), jnp.cos(out)], axis=-1)
 
@@ -3,15 +3,16 @@
 import jax.numpy as jnp
 from chex import Array
 from flax import nnx
-from flux.modules.layers import (
+from jax.typing import DTypeLike
+
+from jflux.modules.layers import (
     DoubleStreamBlock,
     EmbedND,
     LastLayer,
     MLPEmbedder,
     SingleStreamBlock,
     timestep_embedding,
 )
-from jax.typing import DTypeLike
 
 
 @dataclass
@@ -67,8 +68,18 @@ def __init__(self, params: FluxParams):
             rngs=params.rngs,
             param_dtype=params.param_dtype,
         )
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
-        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
+        self.time_in = MLPEmbedder(
+            in_dim=256,
+            hidden_dim=self.hidden_size,
+            rngs=params.rngs,
+            param_dtype=params.param_dtype,
+        )
+        self.vector_in = MLPEmbedder(
+            params.vec_in_dim,
+            self.hidden_size,
+            rngs=params.rngs,
+            param_dtype=params.param_dtype,
+        )
         self.guidance_in = (
             MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
             if params.guidance_embed
@@ -109,7 +120,13 @@ def __init__(self, params: FluxParams):
             ]
         )
 
-        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
+        self.final_layer = LastLayer(
+            self.hidden_size,
+            1,
+            self.out_channels,
+            rngs=params.rngs,
+            param_dtype=params.param_dtype,
+        )
 
     def __call__(
         self,
@@ -136,14 +153,14 @@ def __call__(
         vec = vec + self.vector_in(y)
         txt = self.txt_in(txt)
 
-        ids = jnp.concatenate((txt_ids, img_ids), dim=1)
+        ids = jnp.concatenate((txt_ids, img_ids), axis=1)
         pe = self.pe_embedder(ids)
 
-        for block in self.double_blocks:
+        for block in self.double_blocks.layers:
             img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
 
-        img = jnp.concatenate((txt, img), 1)
-        for block in self.single_blocks:
+        img = jnp.concatenate((txt, img), axis=1)
+        for block in self.single_blocks.layers:
             img = block(img, vec=vec, pe=pe)
         img = img[:, txt.shape[1] :, ...]
 
 
@@ -524,14 +524,27 @@ def __init__(
         self.shift_factor = params.shift_factor
 
     def encode(self, x: Array) -> Array:
+        # channels-first (b, c, h, w) -> channels-last (b, h, w, c) before the encoder
+        x = rearrange(x, "b c h w -> b h w c")
+
         z = self.reg(self.encoder(x))
         z = self.scale_factor * (z - self.shift_factor)
+
+        # channels-last (b, h, w, c) -> channels-first (b, c, h, w) for the returned latent
+        z = rearrange(z, "b h w c -> b c h w")
         return z
 
     def decode(self, z: Array) -> Array:
+        # channels-first (b, c, h, w) -> channels-last (b, h, w, c) before the decoder
+        z = rearrange(z, "b c h w -> b h w c")
+
         z = z / self.scale_factor + self.shift_factor
-        return self.decoder(z)
+        z = self.decoder(z)
+
+        # channels-last (b, h, w, c) -> channels-first (b, c, h, w) for the returned image
+        z = rearrange(z, "b h w c -> b c h w")
+        return z
 
     def __call__(self, x: Array) -> Array:
-        # x -> (b, h, w, c)
+        # x -> (b, c, h, w)
         return self.decode(self.encode(x))
@@ -1,15 +1,13 @@
+# NOTE: this is a PyTorch (torch.nn) module, not a JAX module; its outputs are converted to JAX arrays.
+from torch import nn
 from chex import Array
-from flax import nnx
-from transformers import (
-    CLIPTokenizer,
-    FlaxCLIPTextModel,
-    FlaxT5EncoderModel,
-    T5Tokenizer,
-)
+import jax.numpy as jnp
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
 
 
-class HFEmbedder(nnx.Module):
-    def __init__(self, version: str, max_length: int, **hf_kwargs) -> None:
+class HFEmbedder(nn.Module):
+    def __init__(self, version: str, max_length: int, **hf_kwargs):
+        super().__init__()
         self.is_clip = version.startswith("openai")
         self.max_length = max_length
         self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
@@ -18,33 +16,36 @@ def __init__(self, version: str, max_length: int, **hf_kwargs) -> None:
             self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
                 version, max_length=max_length
             )
-            self.hf_module: FlaxCLIPTextModel = FlaxCLIPTextModel.from_pretrained(
+            self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
                 version, **hf_kwargs
             )
         else:
             self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
                 version, max_length=max_length
             )
-            self.hf_module: FlaxT5EncoderModel = FlaxT5EncoderModel.from_pretrained(
-                version, from_pt=True, **hf_kwargs
+            self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
+                version, **hf_kwargs
             )
 
-        self.hf_module = self.hf_module.eval().requires_grad_(False)  # noqa: ignore
+        self.hf_module = self.hf_module.eval().requires_grad_(False)
 
-    def __call__(self, text: list[str]) -> Array:
+    def forward(self, text: list[str]) -> Array:
         batch_encoding = self.tokenizer(
             text,
             truncation=True,
             max_length=self.max_length,
             return_length=False,
             return_overflowing_tokens=False,
             padding="max_length",
-            return_tensors="np",
+            return_tensors="pt",
         )
 
         outputs = self.hf_module(
             input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
             attention_mask=None,
             output_hidden_states=False,
         )
-        return outputs[self.output_key]
+        torch_outputs = outputs[self.output_key]
+
+        jax_outputs = jnp.array(torch_outputs.cpu().float(), dtype=jnp.bfloat16)
+        return jax_outputs