Skip to content

Commit 9c9fbd7

Browse files
Ayushhgit, claude, davidsbatista
authored
fix: use Document.from_dict in InMemoryDocumentStore.load_from_disk (#11594)
Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
Co-authored-by: David S. Batista <dsbatista@gmail.com>
1 parent 02dd845 commit 9c9fbd7

3 files changed

Lines changed: 30 additions & 1 deletion

File tree

haystack/document_stores/in_memory/document_store.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -403,7 +403,7 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
403403
documents = data.pop("documents")
404404
cls_object = default_from_dict(cls, data)
405405
cls_object.write_documents(
406-
documents=[Document(**doc) for doc in documents], policy=DuplicatePolicy.OVERWRITE
406+
documents=[Document.from_dict(doc) for doc in documents], policy=DuplicatePolicy.OVERWRITE
407407
)
408408
return cls_object
409409

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
---
2+
fixes:
3+
- |
4+
Fixed ``InMemoryDocumentStore.load_from_disk`` reconstructing documents with the plain
5+
``Document`` constructor instead of ``Document.from_dict``. Documents containing a ``blob``
6+
(``ByteStream``) or a ``sparse_embedding`` (``SparseEmbedding``) were loaded with those fields
7+
as raw dictionaries, which later crashed ``repr()``, ``to_dict()``, equality comparison,
8+
``save_to_disk`` of the reloaded store, and any component accessing ``document.blob.data``.

test/document_stores/test_in_memory.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
import pytest
1414

1515
from haystack import Document
16+
from haystack.dataclasses import ByteStream, SparseEmbedding
1617
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
1718
from haystack.document_stores.in_memory import InMemoryDocumentStore
1819
from haystack.testing.document_store import (
@@ -146,6 +147,26 @@ def test_save_to_disk_and_load_from_disk(self, in_memory_doc_store: InMemoryDocu
146147
assert list(document_store_loaded.storage.values()) == docs
147148
assert document_store_loaded.to_dict() == in_memory_doc_store.to_dict()
148149

150+
def test_save_to_disk_and_load_from_disk_with_blob_and_sparse_embedding(
151+
self, in_memory_doc_store: InMemoryDocumentStore, tmp_dir: str
152+
) -> None:
153+
doc = Document(
154+
content="document with binary data",
155+
blob=ByteStream(data=b"binary data", mime_type="image/png"),
156+
sparse_embedding=SparseEmbedding(indices=[0, 5], values=[0.1, 0.9]),
157+
)
158+
in_memory_doc_store.write_documents([doc])
159+
save_path = tmp_dir + "/in_memory_doc_store.json"
160+
in_memory_doc_store.save_to_disk(save_path)
161+
document_store_loaded = InMemoryDocumentStore.load_from_disk(save_path)
162+
163+
loaded_doc = document_store_loaded.filter_documents()[0]
164+
assert isinstance(loaded_doc.blob, ByteStream)
165+
assert isinstance(loaded_doc.sparse_embedding, SparseEmbedding)
166+
assert loaded_doc == doc
167+
# The loaded store must be savable again
168+
document_store_loaded.save_to_disk(save_path)
169+
149170
def test_invalid_bm25_algorithm(self):
150171
with pytest.raises(ValueError, match="BM25 algorithm 'invalid' is not supported"):
151172
InMemoryDocumentStore(bm25_algorithm="invalid") # type: ignore[arg-type]

0 commit comments

Comments
 (0)