
Commit 1970418

prototyping better UX (#134)
* Load weights directly from HF for llama
1 parent 94449c3 commit 1970418

14 files changed: +646, -24 lines changed

convert_checkpoints.py

Lines changed: 0 additions & 3 deletions
@@ -357,9 +357,6 @@ def _load_from_local(input_ckpt_dir: epath.Path):
   if not _FROM_HF.value:
     return _load_orig_llama_weight(input_ckpt_dir)
   else:
-    assert (
-        not FLAGS.quantize_weights
-    ), "Quantization not supported for HF checkpoint."
     return _load_hf_llama_weight(input_ckpt_dir)



install_everything.sh

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ pip show torch_xla2 && pip uninstall -y torch_xla2
 pip install flax
 pip install tensorflow-text
 pip install tensorflow
+pip install huggingface_hub

 pip install ray[default]==2.22.0
 # torch cpu
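
The new huggingface_hub dependency is what lets checkpoints be pulled straight from the Hub rather than from a locally converted directory. The commit's own download logic lives in jetstream_pt.fetch_models (not shown in this diff); as a rough sketch of the library call involved, assuming a hypothetical repo id and an already-configured HF token for gated repos:

# Illustrative only: snapshot_download fetches a whole repo snapshot and returns
# the local cache directory containing config.json and the weight shards.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="meta-llama/Llama-2-7b-hf")  # hypothetical repo id
print(local_dir)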

jetstream_pt/cli.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+import sys
+
+# import torch_xla2 first!
+import torch_xla2  # pylint: disable
+import jax
+from absl import app, flags
+from jetstream.core import server_lib
+from jetstream.core.config_lib import ServerConfig, MetricsServerConfig
+import torch
+
+from jetstream_pt import fetch_models
+from jetstream_pt import environment, engine, quantize_model, torchjax
+from jetstream_pt import config
+
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_id", "", "HuggingFace model id")
+flags.DEFINE_integer("override_batch_size", 32, "The batch size")
+flags.DEFINE_integer("max_input_length", 1024, "The maximum input length")
+flags.DEFINE_integer("max_output_length", 1024, "The maximum output length")
+flags.DEFINE_integer("port", 9000, "port to listen on")
+flags.DEFINE_integer("threads", 64, "number of worker threads in thread pool")
+
+
+def shard_weights(env, weights, weight_shardings):
+  """Shard weights according to weight_shardings"""
+  for k, v in weight_shardings.items():
+    print("SHARDING", k, v)
+  sharded = {}
+  for key, val in weights.items():
+    sharding = env.sharding_by_axis(weight_shardings.get(key, -1))
+    with jax.default_device(jax.devices("cpu")[0]):
+      arr = torch_xla2.tensor.t2j(val)
+    arr = jax.device_put(arr, sharding)
+    sharded[key] = torchjax.to_torch(arr)
+  return sharded
+
+
+def create_engine(devices):
+  """Create Pytorch engine from flags"""
+  torch.set_default_dtype(torch.bfloat16)
+  quant_config = config.create_quantization_config_from_flags()
+  env_data = fetch_models.construct_env_data_from_model_id(
+      FLAGS.model_id,
+      FLAGS.override_batch_size,
+      FLAGS.max_input_length,
+      FLAGS.max_output_length,
+      quant_config.enable_weight_quantization,
+  )
+  env = environment.JetEngineEnvironment(env_data)
+  model = fetch_models.instantiate_model_from_repo_id(FLAGS.model_id, env)
+
+  weight_shardings = model.get_sharding_annotations()
+  sharded_weights = shard_weights(env, model.state_dict(), weight_shardings)
+
+  if quant_config.enable_weight_quantization:
+    model.load_state_dict(sharded_weights, assign=True, strict=False)
+    quantize_model.quantize_model(model, quant_config)
+    sharded_weights = model.state_dict()
+
+  return engine.PyTorchEngine(
+      pt_model=model,
+      env=env,
+      weights=torchjax.from_torch_with_copy(sharded_weights),
+  )
+
+
+def list_model():
+  """Print list of models."""
+  for model_id in fetch_models.model_id_to_class:
+    print(model_id)
+
+
+def serve():
+  """Run gRPC server."""
+  if FLAGS.model_id == "":
+    print("Please specify model_id with --model_id")
+    print("valid model ids are:")
+    list_model()
+    sys.exit(1)
+  devices = server_lib.get_devices()
+  print(f"devices: {devices}")
+
+  server_config = ServerConfig(
+      interleaved_slices=(f"tpu={len(jax.devices())}",),
+      interleaved_engine_create_fns=[create_engine],
+  )
+  print(f"server_config: {server_config}")
+
+  metrics_server_config: MetricsServerConfig | None = None
+
+  # We separate credential from run so that we can unit test it with local credentials.
+  # We would like to add grpc credentials for OSS.
+  jetstream_server = server_lib.run(
+      threads=FLAGS.threads,
+      port=FLAGS.port,
+      config=server_config,
+      devices=devices,
+      metrics_server_config=metrics_server_config,
+  )
+  print("Started jetstream_server....")
+  jetstream_server.wait_for_termination()
+
+
+def interactive():
+  """Run interactive"""
+  raise RuntimeError("Not implemented")
+
+
+def main(argv):
+  """Entry point"""
+  if len(argv) < 2:
+    sys.exit("Invalid arguments. Please specify 'list' or 'serve'.")
+
+  if argv[1] == "list":
+    list_model()
+    return
+
+  if argv[1] == "serve":
+    serve()
+    return
+
+  if argv[1] == "interactive":
+    interactive()
+    return
+
+
+if __name__ == "__main__":
+  app.run(main)
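
Taken together, cli.py gives the package a flag-driven entry point with three commands: list, serve, and the still-unimplemented interactive. Assuming the module is launched directly (the commit does not add a console-script entry point), invocation would look roughly like `python -m jetstream_pt.cli serve --model_id=<hf repo id> --port=9000`. A minimal in-process sketch of the same dispatch, useful because the list command touches only fetch_models and never builds an engine or starts the gRPC server:

# Illustrative only: drive the new CLI without absl's app.run or a TPU server.
# argv[0] is the program name; argv[1] selects the command.
from jetstream_pt import cli

cli.main(["cli", "list"])  # prints the model ids known to fetch_models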

jetstream_pt/engine.py

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,7 @@
 import functools
 import os

+import glob
 from etils import epath
 from flax import struct
 import jax
@@ -40,6 +41,9 @@
 from jetstream_pt.third_party.gemma import config as gemma_config, model as gemma_model
 from jetstream_pt.third_party.mixtral import config as mixtral_config, model as mixtral_model

+from absl import flags
+
+FLAGS = flags.FLAGS

 Mesh = jax.sharding.Mesh
 P = jax.sharding.PartitionSpec
@@ -82,11 +86,13 @@ def __init__(
       self,
       pt_model: torch.nn.Module,
       env: JetEngineEnvironment,
+      weights=None,
   ):
     self.pt_model = pt_model
     self.env = env
     self.default_dtype = jnp.bfloat16 if env.bf16_enable else jnp.float32
     self.rng = jax.random.PRNGKey(0)
+    self.weights = weights

     self.y_sharding = env.sharding_by_axis(1)
     self.x_sharding = env.sharding_by_axis(0)
@@ -713,6 +719,8 @@ def _load_from_state_dict(self, path):

   # pylint: disable-next=all
   def load_params(self) -> Params:
+    if self.weights is not None:
+      return self.weights
     # We want to fix this: load from files
     with jax.default_device(self.colocated_cpus):
       if self.env.checkpoint_path:
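
The net effect of the engine.py changes: PyTorchEngine can now be constructed with weights that are already sharded and resident on device, and load_params simply hands them back instead of reading a checkpoint. A sketch of that path using only names introduced in this commit; model, env, and sharded_weights are assumed to come from a create_engine-style setup as in cli.py:

# Illustrative only: build an engine from pre-loaded weights, bypassing checkpoint_path.
from jetstream_pt import engine, torchjax

eng = engine.PyTorchEngine(
    pt_model=model,
    env=env,
    weights=torchjax.from_torch_with_copy(sharded_weights),
)
params = eng.load_params()  # returns the pre-loaded weights; no file I/O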

jetstream_pt/environment.py

Lines changed: 2 additions & 9 deletions
@@ -12,15 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Tuple, Dict
-
 import dataclasses
-import yaml
+from typing import Tuple

 import jax
 import jax.sharding as jsharding
 from jax.experimental import mesh_utils
 import torch_xla2
+import yaml


 from jetstream_pt import cache_manager
@@ -36,7 +35,6 @@ class QuantizationConfig:
   is_symmetric_weight: bool = True

   enable_activation_quantization: bool = False
-
   enable_kv_quantization: bool = False


@@ -75,11 +73,6 @@ class JetEngineEnvironmentData:
   # This string must be one of the values of attention_kv_axis_names above
   kv_cache_shard_axis: str = "num_attn_heads"

-  # Override sharding axis of a weight by name
-  experimental_sharding_axis_override: Dict[str, int] = dataclasses.field(
-      default_factory=dict
-  )
-
   # QKV fusion has negative performance on TPU, slicing takes longer
   qkv_fusion: bool = False

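With experimental_sharding_axis_override removed from JetEngineEnvironmentData, per-weight sharding decisions now come from the model itself via get_sharding_annotations(), combined with env.sharding_by_axis() as shard_weights in cli.py does above. A minimal sketch of that lookup for a single weight; the weight name and annotation value are hypothetical, and -1 (the default) is taken to mean the weight is replicated rather than sharded:

# Illustrative only: resolve the sharding for one weight from model annotations.
weight_shardings = model.get_sharding_annotations()  # e.g. {"layers.0.wq.weight": 0}
axis = weight_shardings.get("layers.0.wq.weight", -1)
sharding = env.sharding_by_axis(axis)  # usable with jax.device_put, as in shard_weights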