4 changes: 2 additions & 2 deletions model2vec/distill/inference.py
@@ -46,7 +46,7 @@ def create_embeddings(
:param pad_token_id: The pad token id. Used to pad sequences.
:return: The output embeddings.
"""
-model = model.to(device)
+model = model.to(device)  # type: ignore

out_weights: np.ndarray
intermediate_weights: list[np.ndarray] = []
@@ -98,7 +98,7 @@ def _encode_mean_using_model(model: PreTrainedModel, encodings: dict[str, torch.
"""
encodings = {k: v.to(model.device) for k, v in encodings.items()}
encoded: BaseModelOutputWithPoolingAndCrossAttentions = model(**encodings)
-out: torch.Tensor = encoded.last_hidden_state.cpu()
+out: torch.Tensor = encoded.last_hidden_state.cpu()  # type: ignore # typing is wrong.
# NOTE: If the dtype is bfloat16, we convert to float32,
# because numpy does not support bfloat16
# See here: https://github.com/numpy/numpy/issues/19808
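Note on the bfloat16 comment above: numpy simply has no bfloat16 dtype, so the tensor has to be cast before `.numpy()` is called. A minimal illustrative sketch of that guard (not part of this diff):

import numpy as np
import torch

out = torch.zeros(2, 3, dtype=torch.bfloat16)
# numpy cannot represent bfloat16, so cast to float32 first.
if out.dtype == torch.bfloat16:
    out = out.to(torch.float32)
arr: np.ndarray = out.numpy()
print(arr.dtype)  # float32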
69 changes: 49 additions & 20 deletions model2vec/hf_utils.py
@@ -9,6 +9,7 @@
import numpy as np
import safetensors
from huggingface_hub import ModelCard, ModelCardData
+from huggingface_hub.constants import HF_HUB_CACHE
from safetensors.numpy import save_file
from tokenizers import Tokenizer

@@ -96,9 +97,10 @@ def _create_model_card(

def load_pretrained(
folder_or_repo_path: str | Path,
-    subfolder: str | None = None,
-    token: str | None = None,
-    from_sentence_transformers: bool = False,
+    subfolder: str | None,
+    token: str | None,
+    from_sentence_transformers: bool,
+    force_download: bool,
) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
"""
Loads a pretrained model from a folder.
@@ -109,6 +111,8 @@ def load_pretrained(
:param subfolder: The subfolder to load from.
:param token: The huggingface token to use.
:param from_sentence_transformers: Whether to load the model from a sentence transformers model.
+:param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+    already present in the cache.
:raises: FileNotFoundError if the folder exists, but the file does not exist locally.
:return: The embeddings, tokenizer, config, and metadata.

@@ -122,7 +126,13 @@
tokenizer_file = "tokenizer.json"
config_name = "config.json"

-folder_or_repo_path = Path(folder_or_repo_path)
+cached_folder = _get_latest_model_path(str(folder_or_repo_path))
+if cached_folder and not force_download:
+    logger.info(f"Found cached model at {cached_folder}, loading from cache.")
+    folder_or_repo_path = cached_folder
+else:
+    logger.info(f"No cached model found for {folder_or_repo_path}, loading from local or hub.")
+    folder_or_repo_path = Path(folder_or_repo_path)

local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path

@@ -139,9 +149,7 @@
if not tokenizer_path.exists():
raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")

-# README is optional, so this is a bit finicky.
 readme_path = local_folder / "README.md"
-metadata = _get_metadata_from_readme(readme_path)

else:
logger.info("Folder does not exist locally, attempting to use huggingface hub.")
@@ -150,18 +158,11 @@
folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
)
)

-try:
-    readme_path = Path(
-        huggingface_hub.hf_hub_download(
-            folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
-        )
-    )
-    metadata = _get_metadata_from_readme(Path(readme_path))
-except Exception as e:
-    # NOTE: we don't want to raise an error here, since the README is optional.
-    logger.info(f"No README found in the model folder: {e} No model card loaded.")
-    metadata = {}
+readme_path = Path(
+    huggingface_hub.hf_hub_download(
+        folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
+    )
+)

config_path = Path(
huggingface_hub.hf_hub_download(
@@ -175,10 +176,13 @@
)

opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-if from_sentence_transformers:
-    embeddings = opened_tensor_file.get_tensor("embedding.weight")
+embedding_key = "embedding.weight" if from_sentence_transformers else "embeddings"
+embeddings = opened_tensor_file.get_tensor(embedding_key)
+
+if readme_path.exists():
+    metadata = _get_metadata_from_readme(readme_path)
 else:
-    embeddings = opened_tensor_file.get_tensor("embeddings")
+    metadata = {}

tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
config = json.load(open(config_path))
@@ -223,3 +227,28 @@ def push_folder_to_hub(
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)

logger.info(f"Pushed model to {repo_id}")


+def _get_latest_model_path(model_id: str) -> Path | None:
+    """
+    Gets the latest model path for a given identifier from the Hugging Face Hub cache.
+
+    Returns None if there is no cached model. In this case, the model will be downloaded.
+    """
+    # Make path object
+    cache_dir = Path(HF_HUB_CACHE)
+    # This is specific to how HF stores the files.
+    normalized = model_id.replace("/", "--")
+    repo_dir = cache_dir / f"models--{normalized}" / "snapshots"
+
+    if not repo_dir.exists():
+        return None
+
+    # Find all directories.
+    snapshots = [p for p in repo_dir.iterdir() if p.is_dir()]
+    if not snapshots:
+        return None
+
+    # Get the latest directory by modification time.
+    latest_snapshot = max(snapshots, key=lambda p: p.stat().st_mtime)
+    return latest_snapshot
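Illustrative usage of the new helper (not part of this diff; the repo id is a placeholder): with a warm cache it resolves a snapshot directory without any network call, which is what `load_pretrained` now checks before falling back to local files or the hub.

from model2vec.hf_utils import _get_latest_model_path

path = _get_latest_model_path("some-org/some-model")  # placeholder repo id
if path is None:
    print("no cached snapshot; load_pretrained falls back to local or hub")
else:
    print(f"loading from cached snapshot at {path}")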
35 changes: 9 additions & 26 deletions model2vec/model.py
@@ -13,7 +13,7 @@
from tqdm import tqdm

from model2vec.quantization import DType, quantize_and_reduce_dim
-from model2vec.utils import ProgressParallel, load_local_model
+from model2vec.utils import ProgressParallel

PathLike = Union[Path, str]

@@ -156,6 +156,7 @@ def from_pretrained(
subfolder: str | None = None,
quantize_to: str | DType | None = None,
dimensionality: int | None = None,
+force_download: bool = True,
) -> StaticModel:
"""
Load a StaticModel from a local path or huggingface hub path.
@@ -171,6 +172,8 @@
:param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
This is useful if you want to load a model with a lower dimensionality.
Note that this only applies if you have trained your model using mrl or PCA.
+:param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+    already present in the cache.
:return: A StaticModel.
"""
from model2vec.hf_utils import load_pretrained
@@ -180,6 +183,7 @@
token=token,
from_sentence_transformers=False,
subfolder=subfolder,
+force_download=force_download,
)

embeddings = quantize_and_reduce_dim(
@@ -205,6 +209,7 @@ def from_sentence_transformers(
normalize: bool | None = None,
quantize_to: str | DType | None = None,
dimensionality: int | None = None,
+force_download: bool = True,
) -> StaticModel:
"""
Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -219,6 +224,8 @@
:param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
This is useful if you want to load a model with a lower dimensionality.
Note that this only applies if you have trained your model using mrl or PCA.
+:param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+    already present in the cache.
:return: A StaticModel.
"""
from model2vec.hf_utils import load_pretrained
@@ -228,6 +235,7 @@
token=token,
from_sentence_transformers=True,
subfolder=None,
+force_download=force_download,
)

embeddings = quantize_and_reduce_dim(
@@ -447,28 +455,3 @@ def push_to_hub(
with TemporaryDirectory() as temp_dir:
self.save_pretrained(temp_dir, model_name=repo_id)
push_folder_to_hub(Path(temp_dir), subfolder=subfolder, repo_id=repo_id, private=private, token=token)

-@classmethod
-def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
-    """
-    Loads a model from a local path.
-
-    You should only use this code path if you are concerned with start-up time.
-    Loading via the `from_pretrained` method is safer, and auto-downloads, but
-    also means we import a whole bunch of huggingface code that we don't need.
-
-    Additionally, huggingface will check the most recent version of the model,
-    which can be slow.
-
-    :param path: The path to load the model from. The path is a directory saved by the
-        `save_pretrained` method.
-    :return: A StaticModel
-    :raises: ValueError if the path is not a directory.
-    """
-    path = Path(path)
-    if not path.is_dir():
-        raise ValueError(f"Path {path} is not a directory.")
-
-    embeddings, tokenizer, config = load_local_model(path)
-
-    return StaticModel(embeddings, tokenizer, config)
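With `load_local` removed, `from_pretrained` with `force_download=False` is the cache-aware replacement; a sketch of the equivalent call after this change (illustrative; the repo id is a placeholder):

from model2vec import StaticModel

# force_download=False reuses a cached snapshot and skips the hub
# version check that made load_local attractive for start-up time.
model = StaticModel.from_pretrained("some-org/some-model", force_download=False)
embeddings = model.encode(["a quick smoke test"])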
24 changes: 0 additions & 24 deletions model2vec/utils.py
@@ -102,27 +102,3 @@ def setup_logging() -> None:
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[RichHandler(rich_tracebacks=True)],
)


-def load_local_model(folder: Path) -> tuple[np.ndarray, Tokenizer, dict[str, str]]:
-    """Load a local model."""
-    embeddings_path = folder / "model.safetensors"
-    tokenizer_path = folder / "tokenizer.json"
-    config_path = folder / "config.json"
-
-    opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embeddings = opened_tensor_file.get_tensor("embeddings")
-
-    if config_path.exists():
-        config = json.load(open(config_path))
-    else:
-        config = {}
-
-    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
-
-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
-
-    return embeddings, tokenizer, config
24 changes: 2 additions & 22 deletions tests/test_model.py
@@ -118,9 +118,9 @@ def test_encode_as_tokens_empty(
encoded = model.encode_as_sequence("")
assert np.array_equal(encoded, np.zeros(shape=(0, 2), dtype=model.embedding.dtype))

-encoded = model.encode_as_sequence(["", ""])
+encoded_list = model.encode_as_sequence(["", ""])
 out = [np.zeros(shape=(0, 2), dtype=model.embedding.dtype) for _ in range(2)]
-assert [np.array_equal(x, y) for x, y in zip(encoded, out)]
+assert [np.array_equal(x, y) for x, y in zip(encoded_list, out)]


def test_encode_empty_sentence(
@@ -273,23 +273,3 @@ def test_dim(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: d
model = StaticModel(mock_vectors, mock_tokenizer, mock_config)
assert model.dim == 2
assert model.dim == model.embedding.shape[1]


-def test_local_load_from_model(mock_tokenizer: Tokenizer) -> None:
-    """Test local load from a model."""
-    x = np.ones((mock_tokenizer.get_vocab_size(), 2))
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-
-        model = StaticModel.load_local(tempdir_path)
-        assert model.embedding.shape == x.shape
-        assert model.tokenizer.to_str() == mock_tokenizer.to_str()
-        assert model.config == {"normalize": False}
-
-
-def test_local_load_from_model_no_folder() -> None:
-    """Test local load from a model with no folder."""
-    with pytest.raises(ValueError):
-        StaticModel.load_local("woahbuddy_relax_this_is_just_a_test")
43 changes: 1 addition & 42 deletions tests/test_utils.py
@@ -14,7 +14,7 @@

from model2vec.distill.utils import select_optimal_device
from model2vec.hf_utils import _get_metadata_from_readme
-from model2vec.utils import get_package_extras, importable, load_local_model
+from model2vec.utils import get_package_extras, importable


def test__get_metadata_from_readme_not_exists() -> None:
@@ -78,44 +78,3 @@ def test_get_package_extras() -> None:
def test_get_package_extras_empty() -> None:
"""Test package extras with an empty package."""
assert not list(get_package_extras("tqdm", ""))


-@pytest.mark.parametrize(
-    "config, expected",
-    [
-        ({"dog": "cat"}, {"dog": "cat"}),
-        ({}, {}),
-        (None, {}),
-    ],
-)
-def test_local_load(mock_tokenizer: Tokenizer, config: dict[str, Any], expected: dict[str, Any]) -> None:
-    """Test local loading."""
-    x = np.ones((mock_tokenizer.get_vocab_size(), 2))
-
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-        if config is not None:
-            json.dump(config, open(tempdir_path / "config.json", "w"))
-        arr, tokenizer, config = load_local_model(tempdir_path)
-        assert config == expected
-        assert tokenizer.to_str() == mock_tokenizer.to_str()
-        assert arr.shape == x.shape
-
-
-def test_local_load_mismatch(mock_tokenizer: Tokenizer, caplog: pytest.LogCaptureFixture) -> None:
-    """Test local loading."""
-    x = np.ones((10, 2))
-
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-
-        load_local_model(tempdir_path)
-        expected = (
-            f"Number of tokens does not match number of embeddings: `{len(mock_tokenizer.get_vocab())}` vs `{len(x)}`"
-        )
-        assert len(caplog.records) == 1
-        assert caplog.records[0].message == expected