Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion haystack/document_stores/in_memory/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
documents = data.pop("documents")
cls_object = default_from_dict(cls, data)
cls_object.write_documents(
documents=[Document(**doc) for doc in documents], policy=DuplicatePolicy.OVERWRITE
documents=[Document.from_dict(doc) for doc in documents], policy=DuplicatePolicy.OVERWRITE
)
return cls_object

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
fixes:
- |
Fixed ``InMemoryDocumentStore.load_from_disk`` reconstructing documents with the plain
``Document`` constructor instead of ``Document.from_dict``. Documents containing a ``blob``
(``ByteStream``) or a ``sparse_embedding`` (``SparseEmbedding``) were loaded with those fields
as raw dictionaries, which later crashed ``repr()``, ``to_dict()``, equality comparison,
``save_to_disk`` of the reloaded store, and any component accessing ``document.blob.data``.
21 changes: 21 additions & 0 deletions test/document_stores/test_in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pytest

from haystack import Document
from haystack.dataclasses import ByteStream, SparseEmbedding
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.testing.document_store import (
Expand Down Expand Up @@ -146,6 +147,26 @@ def test_save_to_disk_and_load_from_disk(self, in_memory_doc_store: InMemoryDocu
assert list(document_store_loaded.storage.values()) == docs
assert document_store_loaded.to_dict() == in_memory_doc_store.to_dict()

def test_save_to_disk_and_load_from_disk_with_blob_and_sparse_embedding(
    self, in_memory_doc_store: InMemoryDocumentStore, tmp_dir: str
) -> None:
    """
    Round-trip a document carrying a ``blob`` and a ``sparse_embedding`` through disk.

    After saving the store and loading it back, the stored document must expose both
    fields as ``ByteStream`` / ``SparseEmbedding`` instances, compare equal to the
    document that was written, and the reloaded store must be savable again.
    """
    blob = ByteStream(data=b"binary data", mime_type="image/png")
    sparse_embedding = SparseEmbedding(indices=[0, 5], values=[0.1, 0.9])
    original = Document(content="document with binary data", blob=blob, sparse_embedding=sparse_embedding)
    in_memory_doc_store.write_documents([original])

    json_path = tmp_dir + "/in_memory_doc_store.json"
    in_memory_doc_store.save_to_disk(json_path)
    reloaded_store = InMemoryDocumentStore.load_from_disk(json_path)

    restored = reloaded_store.filter_documents()[0]
    # Nested fields must come back as their dataclass types, not as raw dictionaries.
    assert isinstance(restored.blob, ByteStream)
    assert isinstance(restored.sparse_embedding, SparseEmbedding)
    assert restored == original
    # Saving the reloaded store a second time must also succeed.
    reloaded_store.save_to_disk(json_path)

def test_invalid_bm25_algorithm(self):
with pytest.raises(ValueError, match="BM25 algorithm 'invalid' is not supported"):
InMemoryDocumentStore(bm25_algorithm="invalid") # type: ignore[arg-type]
Expand Down
Loading