Skip to content

Commit b770d28

Browse files
committed
Use chunk index in document_key in ModelSource
1 parent c88f683 commit b770d28

File tree

2 files changed

+70
-4
lines changed

2 files changed

+70
-4
lines changed

src/django_ai_core/contrib/index/source.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,8 @@ def provides_object(self, obj: object) -> bool:
175175
def provides_document(self, document: Document) -> bool:
    """Return whether ``document`` carries this source's key prefix.

    The part of ``document.document_key`` that precedes its first ``":"``
    (or the whole key when it contains no colon) is compared with
    ``self.source_id``.
    """
    key_prefix, _, _ = document.document_key.partition(":")
    return key_prefix == self.source_id
177177

178-
def get_document_key(self, obj) -> str:
179-
return f"{self.source_id}:{obj.pk}"
178+
def get_document_key(self, obj, chunk) -> str:
    """Build the document key for one chunk of ``obj``.

    The key is ``self.source_id``, ``obj.pk`` and ``chunk`` joined with
    colons, so keys for the same object differ whenever ``chunk`` differs.
    """
    source_id, object_pk = self.source_id, obj.pk
    return f"{source_id}:{object_pk}:{chunk}"
180180

181181
def _object_to_documents(self, obj: models.Model) -> Iterable[Document]:
182182
if not self.provides_object(obj):
@@ -185,9 +185,9 @@ def _object_to_documents(self, obj: models.Model) -> Iterable[Document]:
185185
metadata = self.get_metadata(obj)
186186
content = self.get_content(obj)
187187

188-
for document in self.chunk_transformer.transform(content):
188+
for chunk, document in enumerate(self.chunk_transformer.transform(content)):
189189
yield Document(
190-
document_key=self.get_document_key(obj),
190+
document_key=self.get_document_key(obj, chunk),
191191
content=document,
192192
metadata=metadata,
193193
)
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import random
2+
3+
import pytest
4+
from testapp.models import Book
5+
6+
from django_ai_core.contrib.index.source import ModelSource
7+
8+
9+
def generate_long_string(length=2000):
    """Return a pseudo-random sentence of at least ``length`` characters.

    Words are drawn with ``random.choice`` from a fixed vocabulary until the
    running total (each word plus one separator character) reaches
    ``length``. The words are joined with single spaces, the result is
    capitalized, and a trailing period is appended. A non-positive
    ``length`` draws no words and yields ``"."``.
    """
    vocabulary = (
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "artificial", "intelligence", "machine", "learning", "data",
        "science", "python", "django", "wagtail", "testing", "development",
        "software", "application", "framework", "database", "model", "query",
        "index", "search", "semantic", "vector", "embedding", "algorithm",
        "neural", "network", "training", "optimization", "performance",
        "scalability",
    )

    picked = []
    remaining = length
    # Each word "costs" its own length plus one character for the separator
    # (or, for the last word, the closing period).
    while remaining > 0:
        choice = random.choice(vocabulary)
        picked.append(choice)
        remaining -= len(choice) + 1

    sentence = " ".join(picked)
    return f"{sentence.capitalize()}."
56+
57+
58+
@pytest.mark.django_db
def test_model_source_returns_unique_keys():
    """Check that ``ModelSource`` gives every document a distinct key.

    Five ``Book`` rows with long generated descriptions are created, all
    documents are collected from a ``ModelSource`` over ``Book``, and the
    collected ``document_key`` values are required to be pairwise distinct.
    """
    for _ in range(5):
        Book.objects.create(title="Book Title", description=generate_long_string())

    model_source = ModelSource(model=Book)
    document_keys = [doc.document_key for doc in model_source.get_documents()]

    # Without this guard the uniqueness check below would pass vacuously
    # if the source produced no documents at all.
    assert document_keys, "ModelSource produced no documents"
    assert len(document_keys) == len(set(document_keys))

0 commit comments

Comments
 (0)