Skip to content

Commit

Permalink
Prevent an error where httpx uses a different event loop during method chaining…
Browse files Browse the repository at this point in the history
… on the QA (#785)

* add test code for this error (easily resolved by using a separate OpenAI client instance per call)

* edit documentation to use separate OpenAI client for reducing errors

---------

Co-authored-by: jeffrey <vkefhdl1@gmail.com>
  • Loading branch information
vkehfdl1 and jeffrey authored Oct 4, 2024
1 parent 832306f commit 784203b
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 23 deletions.
9 changes: 0 additions & 9 deletions docs/source/data_creation/data_creation.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
# Data Creation

```{warning}
This is the beta version of new Data Creation.
This will be the main data creation pipeline at the AutoRAG v0.3 release.
At the time, the legacy version of data creation will be deprecated.
Plus, it is still a development version, so some features are not implemented yet
and there may be bugs.
```

Data creation is a crucial step in using AutoRAG, because AutoRAG needs an evaluation dataset for optimizing the RAG pipelines.
The following guide covers how to use LLM to create data in a form that AutoRAG can use.

Expand Down
6 changes: 2 additions & 4 deletions docs/source/data_creation/qa_creation/answer_gen.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ from autorag.data.qa.schema import QA
from autorag.data.qa.generation_gt.openai_gen_gt import make_basic_gen_gt
from openai import AsyncOpenAI

client = AsyncOpenAI()
qa = QA(qa_df)
result_qa = qa.batch_apply(make_basic_gen_gt, client=client)
result_qa = qa.batch_apply(make_basic_gen_gt, client=AsyncOpenAI())
```

### LlamaIndex
Expand All @@ -48,9 +47,8 @@ from autorag.data.qa.schema import QA
from autorag.data.qa.generation_gt.openai_gen_gt import make_concise_gen_gt
from openai import AsyncOpenAI

client = AsyncOpenAI()
qa = QA(qa_df)
result_qa = qa.batch_apply(make_concise_gen_gt, client=client)
result_qa = qa.batch_apply(make_concise_gen_gt, client=AsyncOpenAI())
```

### LlamaIndex
Expand Down
6 changes: 2 additions & 4 deletions docs/source/data_creation/qa_creation/filter.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,8 @@ from openai import AsyncOpenAI
from autorag.data.qa.schema import QA
from autorag.data.qa.filter.dontknow import dontknow_filter_openai

openai_client = AsyncOpenAI()
qa = QA(qa_df, corpus)
filtered_qa = qa.batch_filter(dontknow_filter_openai, client=openai_client, lang="en").map(
filtered_qa = qa.batch_filter(dontknow_filter_openai, client=AsyncOpenAI(), lang="en").map(
lambda df: df.reset_index(drop=True) # reset index
)
```
Expand Down Expand Up @@ -92,10 +91,9 @@ from openai import AsyncOpenAI
from autorag.data.qa.schema import QA
from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_openai

client = AsyncOpenAI()
en_qa = QA(en_qa_df)
result_en_qa = en_qa.batch_filter(
passage_dependency_filter_openai, client=client, lang="en"
passage_dependency_filter_openai, client=AsyncOpenAI(), lang="en"
).map(lambda df: df.reset_index(drop=True))
```

Expand Down
9 changes: 3 additions & 6 deletions docs/source/data_creation/qa_creation/query_gen.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@ from openai import AsyncOpenAI
from autorag.data.qa.schema import QA
from autorag.data.qa.query.openai_gen_query import factoid_query_gen

openai_client = AsyncOpenAI()
qa = QA(qa_df)
result_qa = qa.batch_apply(factoid_query_gen, client=openai_client, lang="ko")
result_qa = qa.batch_apply(factoid_query_gen, client=AsyncOpenAI(), lang="ko")
```

- LlamaIndex
Expand Down Expand Up @@ -71,9 +70,8 @@ from openai import AsyncOpenAI
from autorag.data.qa.schema import QA
from autorag.data.qa.query.openai_gen_query import concept_completion_query_gen

openai_client = AsyncOpenAI()
qa = QA(qa_df)
result_qa = qa.batch_apply(concept_completion_query_gen, client=openai_client, lang="ko")
result_qa = qa.batch_apply(concept_completion_query_gen, client=AsyncOpenAI(), lang="ko")
```

- LlamaIndex
Expand Down Expand Up @@ -114,9 +112,8 @@ from openai import AsyncOpenAI
from autorag.data.qa.schema import QA
from autorag.data.qa.query.openai_gen_query import two_hop_incremental

openai_client = AsyncOpenAI()
qa = QA(qa_df)
result_qa = qa.batch_apply(two_hop_incremental, client=openai_client)
result_qa = qa.batch_apply(two_hop_incremental, client=AsyncOpenAI())
```

- LlamaIndex
Expand Down
55 changes: 55 additions & 0 deletions tests/autorag/data/qa/test_data_creation_piepline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from datetime import datetime

import pandas as pd
import pytest
from llama_index.core.llms import MockLLM
from openai import AsyncOpenAI

from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
Expand All @@ -12,6 +14,15 @@
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Raw

from autorag.data.qa.generation_gt.openai_gen_gt import (
make_basic_gen_gt as openai_make_basic_gen_gt,
make_concise_gen_gt as openai_make_concise_gen_gt,
)
from autorag.data.qa.query.openai_gen_query import (
factoid_query_gen as openai_factoid_query_gen,
)
from tests.delete_tests import is_github_action

initial_raw = Raw(
pd.DataFrame(
{
Expand Down Expand Up @@ -80,3 +91,47 @@ def test_make_dataset_from_raw():
len(retrieval_gt[0]) == 1
for retrieval_gt in initial_qa.data["retrieval_gt"].tolist()
)


@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
def test_make_dataset_from_raw_openai():
    """End-to-end QA dataset creation using the OpenAI-backed generators.

    Regression test for #785: each ``batch_apply`` in the chain receives a
    fresh ``AsyncOpenAI()`` instance, so httpx never reuses a client whose
    connection pool is bound to a previous (closed) event loop between the
    chained async batch calls.

    NOTE(review): requires a live OpenAI API key; hence skipped on CI.
    """
    # Chunk the module-level raw fixture into a corpus
    # (token-based chunks of size 128 with 5-token overlap).
    initial_corpus = initial_raw.chunk(
        "llama_index_chunk", chunk_method="token", chunk_size=128, chunk_overlap=5
    )

    initial_qa = (
        # Sample 3 single-hop rows and normalize the index.
        initial_corpus.sample(random_single_hop, n=3)
        .map(
            lambda df: df.reset_index(drop=True),
        )
        .make_retrieval_gt_contents()
        # A separate AsyncOpenAI client per batch_apply call is deliberate:
        # sharing one client across these chained async batches triggers
        # "attached to a different event loop" errors from httpx (#785).
        .batch_apply(
            openai_factoid_query_gen,
            client=AsyncOpenAI(),
        )
        .batch_apply(
            openai_make_basic_gen_gt,
            client=AsyncOpenAI(),
        )
        .batch_apply(
            openai_make_concise_gen_gt,
            client=AsyncOpenAI(),
        )
        # Rule-based filter: drop rows whose answer is "don't know".
        .filter(
            dontknow_filter_rule_based,
            lang="en",
        )
    )
    # All 3 sampled rows are expected to survive the filter.
    assert len(initial_qa.data) == 3
    assert set(initial_qa.data.columns) == {
        "qid",
        "retrieval_gt",
        "generation_gt",
        "query",
        "retrieval_gt_contents",
    }
    # Two ground truths per row: one basic + one concise gen_gt.
    assert all(len(gen_gt) == 2 for gen_gt in initial_qa.data["generation_gt"].tolist())
    assert all(
        len(retrieval_gt[0]) == 1
        for retrieval_gt in initial_qa.data["retrieval_gt"].tolist()
    )

0 comments on commit 784203b

Please sign in to comment.