Skip to content

Commit 5be69aa

Browse files
Fix unstructured notebook (#408)
* remove-dotenv * fix-doc-error * notebook * doc * wget resource * format --------- Co-authored-by: Nicolò Boschi <[email protected]>
1 parent f7b6b52 commit 5be69aa

File tree

2 files changed

+62
-204
lines changed

2 files changed

+62
-204
lines changed

docs/modules/examples/pages/langchain-unstructured-astra.adoc

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ DB Access Token] with Database Administrator permissions.
2727
Install the following dependencies:
2828
[source,python]
2929
----
30-
pip install ragstack-ai python-dotenv
30+
pip install ragstack-ai
3131
----
3232
See the https://docs.datastax.com/en/ragstack/docs/prerequisites.html[Prerequisites] page for more details.
3333

@@ -163,13 +163,15 @@ for el in elements:
163163
if el.category in ["Header", "Footer"]:
164164
continue # skip these
165165
if el.category == "Title":
166-
documents.append(current_doc)
166+
if current_doc is not None:
167+
documents.append(current_doc)
167168
current_doc = None
168169
if not current_doc:
169170
current_doc = Document(page_content="", metadata=el.metadata.to_dict())
170171
current_doc.page_content += el.metadata.text_as_html if el.category == "Table" else el.text
171172
if el.category == "Table":
172-
documents.append(current_doc)
173+
if current_doc is not None:
174+
documents.append(current_doc)
173175
current_doc = None
174176
175177
astra_db_store.add_documents(documents)
@@ -197,7 +199,7 @@ chain = (
197199

198200
== Execute queries
199201

200-
. Ask a question that should be answered by the text of the document - this query should return a relevant response.
202+
. Ask a question that should be answered by the text of the document. This query should return `Reducing the attention key size hurts model quality.`
201203
+
202204
[source,python]
203205
----
@@ -206,7 +208,9 @@ print("\n***********New Unstructured Basic Query Engine***********")
206208
print(response_1)
207209
----
208210
+
209-
. Ask a question that can be answered from the table data. This highlights the power of using Unstructured.io.
211+
. Ask a question that can be answered from the table data.
212+
This query should return `The 'WSJ 23 F1' value for 'Dyer et al. (2016) (5]' was 91.7.` because the table data contains this information.
213+
This highlights the power of using Unstructured.io.
210214
+
211215
[source,python]
212216
----
@@ -215,7 +219,7 @@ print("\n***********New Unstructured Basic Query Engine***********")
215219
print(response_2)
216220
----
217221
. Ask a question with an expected lack of context.
218-
This query should return `I don't know. The context does not provide any information about George Washington's birthdate.` because your document does not contain information about the George Washington.
222+
This query should return `I don't know. The context does not provide any information about George Washington's birthdate.` because your document does not contain information about George Washington.
219223
+
220224
[source,python]
221225
----
@@ -235,20 +239,25 @@ import os
235239
import requests
236240
237241
from dotenv import load_dotenv
238-
from langchain_community.document_loaders import unstructured
239242
from langchain_astradb import AstraDBVectorStore
240243
from langchain_core.documents import Document
241244
from langchain_core.output_parsers import StrOutputParser
242245
from langchain_core.prompts import PromptTemplate
243246
from langchain_core.runnables import RunnablePassthrough
244247
248+
from langchain_community.document_loaders import (
249+
unstructured,
250+
UnstructuredAPIFileLoader,
251+
)
252+
245253
from langchain_openai import (
246254
ChatOpenAI,
247255
OpenAIEmbeddings,
248256
)
249257
250258
load_dotenv()
251259
260+
# download pdf
252261
url = "https://raw.githubusercontent.com/datastax/ragstack-ai/48bc55e7dc4de6a8b79fcebcedd242dc1254dd63/examples/notebooks/resources/attention_pages_9_10.pdf"
253262
file_path = "./attention_pages_9_10.pdf"
254263
@@ -259,8 +268,19 @@ if response.status_code == 200:
259268
print("Download complete.")
260269
else:
261270
print("Error downloading the file.")
262-
exit(1)
263271
272+
# simple parse
273+
loader = UnstructuredAPIFileLoader(
274+
file_path="./attention_pages_9_10.pdf",
275+
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
276+
url = os.getenv("UNSTRUCTURED_API_URL"),
277+
)
278+
simple_docs = loader.load()
279+
280+
print(len(simple_docs))
281+
print(simple_docs[0].page_content[0:400])
282+
283+
# complex parse
264284
elements = unstructured.get_elements_from_api(
265285
file_path="./attention_pages_9_10.pdf",
266286
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
@@ -269,31 +289,40 @@ elements = unstructured.get_elements_from_api(
269289
pdf_infer_table_structure=True,
270290
)
271291
292+
print(len(elements))
293+
tables = [el for el in elements if el.category == "Table"]
294+
print(tables[1].metadata.text_as_html)
295+
296+
# create vector store
272297
astra_db_store = AstraDBVectorStore(
273298
collection_name="langchain_unstructured",
274299
embedding=OpenAIEmbeddings(),
275300
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
276301
api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT")
277302
)
278303
304+
# load documents
279305
documents = []
280306
current_doc = None
281307
282308
for el in elements:
283309
if el.category in ["Header", "Footer"]:
284310
continue # skip these
285311
if el.category == "Title":
286-
documents.append(current_doc)
312+
if current_doc is not None:
313+
documents.append(current_doc)
287314
current_doc = None
288315
if not current_doc:
289316
current_doc = Document(page_content="", metadata=el.metadata.to_dict())
290317
current_doc.page_content += el.metadata.text_as_html if el.category == "Table" else el.text
291318
if el.category == "Table":
292-
documents.append(current_doc)
319+
if current_doc is not None:
320+
documents.append(current_doc)
293321
current_doc = None
294322
295323
astra_db_store.add_documents(documents)
296324
325+
# prompt and query
297326
prompt = """
298327
Answer the question based only on the supplied context. If you don't know the answer, say "I don't know".
299328
Context: {context}
@@ -321,7 +350,6 @@ print(response_2)
321350
response_3 = chain.invoke("When was George Washington born?")
322351
print("\n***********New Unstructured Basic Query Engine***********")
323352
print(response_3)
324-
325353
----
326354
====
327355

0 commit comments

Comments
 (0)