Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ pip install dingo-python

```python
from dingo.config.config import DynamicLLMConfig
from dingo.io.input.MetaData import MetaData
from dingo.io.input.Data import Data
from dingo.model.llm.llm_text_quality_model_base import LLMTextQualityModelBase
from dingo.model.rule.rule_common import RuleEnterAndSpace

data = MetaData(
data = Data(
data_id='123',
prompt="hello, introduce the world",
content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
Expand Down Expand Up @@ -283,7 +283,7 @@ If the built-in rules don't meet your requirements, you can create custom ones:
from dingo.model import Model
from dingo.model.rule.base import BaseRule
from dingo.config.config import DynamicRuleConfig
from dingo.io import MetaData
from dingo.io import Data
from dingo.model.modelres import ModelRes

@Model.rule_register('QUALITY_BAD_RELEVANCE', ['default'])
Expand All @@ -293,7 +293,7 @@ class MyCustomRule(BaseRule):
dynamic_config = DynamicRuleConfig(pattern=r'your_pattern_here')

@classmethod
def eval(cls, input_data: MetaData) -> ModelRes:
def eval(cls, input_data: Data) -> ModelRes:
res = ModelRes()
# Your rule implementation here
return res
Expand Down Expand Up @@ -343,7 +343,7 @@ from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder.appName("Dingo").getOrCreate()
spark_rdd = spark.sparkContext.parallelize([...]) # Your data as MetaData objects
spark_rdd = spark.sparkContext.parallelize([...]) # Your data as Data objects

input_args = InputArgs(eval_group="default", save_data=True)
executor = Executor.exec_map["spark"](input_args, spark_session=spark, spark_rdd=spark_rdd)
Expand Down
10 changes: 5 additions & 5 deletions README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ pip install dingo-python

```python
from dingo.config.config import DynamicLLMConfig
from dingo.io.input.MetaData import MetaData
from dingo.io.input.Data import Data
from dingo.model.llm.llm_text_quality_model_base import LLMTextQualityModelBase
from dingo.model.rule.rule_common import RuleEnterAndSpace

data = MetaData(
data = Data(
data_id='123',
prompt="hello, introduce the world",
content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
Expand Down Expand Up @@ -283,7 +283,7 @@ input_data = {
from dingo.model import Model
from dingo.model.rule.base import BaseRule
from dingo.config.config import DynamicRuleConfig
from dingo.io import MetaData
from dingo.io import Data
from dingo.model.modelres import ModelRes

@Model.rule_register('QUALITY_BAD_RELEVANCE', ['default'])
Expand All @@ -293,7 +293,7 @@ class MyCustomRule(BaseRule):
dynamic_config = DynamicRuleConfig(pattern=r'your_pattern_here')

@classmethod
def eval(cls, input_data: MetaData) -> ModelRes:
def eval(cls, input_data: Data) -> ModelRes:
res = ModelRes()
# 您的规则实现
return res
Expand Down Expand Up @@ -343,7 +343,7 @@ from pyspark.sql import SparkSession

# 初始化Spark
spark = SparkSession.builder.appName("Dingo").getOrCreate()
spark_rdd = spark.sparkContext.parallelize([...]) # 以MetaData对象形式的数据
spark_rdd = spark.sparkContext.parallelize([...]) # 以Data对象形式的数据

input_args = InputArgs(eval_group="default", save_data=True)
executor = Executor.exec_map["spark"](input_args, spark_session=spark, spark_rdd=spark_rdd)
Expand Down
18 changes: 9 additions & 9 deletions dingo/data/converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Callable, Dict, List, Protocol, Union

from dingo.data.converter.img_utils import find_s3_image
from dingo.io import InputArgs, MetaData
from dingo.io import Data, InputArgs


class ConverterProto(Protocol):
Expand Down Expand Up @@ -73,7 +73,7 @@ def _convert(raw: Union[str, Dict]):
prompt += dialogs[-1]["role"]
content += dialogs[-1]["content"]

return MetaData(**{
return Data(**{
'data_id': j["_id"],
'prompt': prompt,
'content': content,
Expand Down Expand Up @@ -134,7 +134,7 @@ def _convert(raw: Union[str, Dict]):
content += '\n\n'
content += f"user: {turn.get('user', '')}"
content += f"\n\nassistant: {turn.get('bot', '')}"
yield MetaData(**{
yield Data(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': '',
'content': content,
Expand All @@ -160,7 +160,7 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
j = json.loads(raw)
for k, v in j.items():
yield MetaData(**{
yield Data(**{
'data_id': cls.find_levels_data(v, input_args.column_id) if input_args.column_id != '' else str(k),
'prompt': cls.find_levels_data(v, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(v, input_args.column_content) if input_args.column_content != '' else '',
Expand All @@ -185,7 +185,7 @@ def convertor(cls, input_args: InputArgs) -> Callable:
def _convert(raw: Union[str, Dict]):
if isinstance(raw, Dict):
raw = json.dumps(raw)
data = MetaData(**{
data = Data(**{
'data_id': str(cls.data_id),
'prompt': '',
'content': raw,
Expand Down Expand Up @@ -214,7 +214,7 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
j = json.loads(raw)
cls.data_id += 1
return MetaData(**{
return Data(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
Expand Down Expand Up @@ -242,7 +242,7 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
l_j = json.loads(raw)
for j in l_j:
yield MetaData(**{
yield Data(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
Expand Down Expand Up @@ -271,7 +271,7 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
j = json.loads(raw)
cls.data_id += 1
return MetaData(**{
return Data(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
Expand Down Expand Up @@ -300,7 +300,7 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
j = json.loads(raw)
cls.data_id += 1
return MetaData(**{
return Data(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
Expand Down
4 changes: 2 additions & 2 deletions dingo/data/dataset/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from dingo.data.converter import BaseConverter, converters
from dingo.data.datasource.base import DataSource
from dingo.io import InputArgs, MetaData
from dingo.io import Data, InputArgs
from dingo.utils import log


Expand Down Expand Up @@ -102,7 +102,7 @@ def to_dict(self) -> Dict[str, str]:
}

@abstractmethod
def get_data(self, **kwargs) -> Generator[MetaData, None, None]:
def get_data(self, **kwargs) -> Generator[Data, None, None]:
"""Eval Data Generator.
"""

Expand Down
6 changes: 3 additions & 3 deletions dingo/data/dataset/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dingo.data.datasource import DataSource
from dingo.data.datasource.huggingface import HuggingFaceSource
from dingo.data.utils.digit import compute_pandas_digest
from dingo.io import MetaData
from dingo.io import Data

_MAX_ROWS_FOR_DIGEST_COMPUTATION_AND_SCHEMA_INFERENCE = 10000

Expand Down Expand Up @@ -73,15 +73,15 @@ def to_dict(self) -> Dict[str, str]:
)
return config

def get_data(self) -> Generator[MetaData, None, None]:
def get_data(self) -> Generator[Data, None, None]:
"""
Returns the input model for the dataset.
Convert data here.
"""
for data_raw in self._ds:
if self._converter == "plaintext":
data_raw = data_raw[self._targets]
data: Union[Generator[MetaData], MetaData] = self.converter(data_raw)
data: Union[Generator[Data], Data] = self.converter(data_raw)
if isinstance(data, Generator):
for d in data:
yield d
Expand Down
6 changes: 3 additions & 3 deletions dingo/data/dataset/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dingo.data.dataset.base import Dataset
from dingo.data.datasource import DataSource
from dingo.data.datasource.local import LocalDataSource
from dingo.io import MetaData
from dingo.io import Data


@Dataset.register()
Expand Down Expand Up @@ -58,13 +58,13 @@ def to_dict(self) -> Dict[str, str]:
)
return config

def get_data(self) -> Generator[MetaData, None, None]:
def get_data(self) -> Generator[Data, None, None]:
"""
Returns the input model for the dataset.
Convert data here.
"""
for data_raw in self._ds:
data: Union[Generator[MetaData], MetaData] = self.converter(data_raw)
data: Union[Generator[Data], Data] = self.converter(data_raw)
if isinstance(data, Generator):
for d in data:
yield d
Expand Down
8 changes: 4 additions & 4 deletions dingo/data/dataset/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dingo.data.dataset.base import Dataset
from dingo.data.datasource import DataSource
from dingo.data.utils.digit import compute_pandas_digest
from dingo.io import MetaData
from dingo.io import Data
from dingo.utils import log


Expand Down Expand Up @@ -75,16 +75,16 @@ def to_dict(self) -> Dict[str, str]:
)
return config

def get_data(self) -> Generator[MetaData, None, None]:
def get_data(self) -> Generator[Data, None, None]:
"""
Returns the input model for the dataset.
But convert data in executor.
"""
for data_raw in self._ds:
if self.source.get_source_type() == "hugging_face" and self.input_args.data_format == "plaintext":
data_raw = data_raw[self.input_args.column_content]
data: Union[Generator[MetaData, None, None], MetaData] = self.converter(data_raw)
if not isinstance(data, MetaData):
data: Union[Generator[Data, None, None], Data] = self.converter(data_raw)
if not isinstance(data, Data):
for d in data:
try:
yield d
Expand Down
2 changes: 1 addition & 1 deletion dingo/exec/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import wraps
from typing import Any, Dict, List, Protocol, Type, Union

from dingo.io import MetaData, SummaryModel
from dingo.io import Data, SummaryModel


class ExecProto(Protocol):
Expand Down
12 changes: 6 additions & 6 deletions dingo/exec/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from dingo.config import GlobalConfig
from dingo.data import Dataset, DataSource, dataset_map, datasource_map
from dingo.exec.base import ExecProto, Executor
from dingo.io import InputArgs, MetaData, ResultInfo, SummaryModel
from dingo.io import Data, InputArgs, ResultInfo, SummaryModel
from dingo.model import Model
from dingo.model.llm.base import BaseLLM
from dingo.model.modelres import ModelRes
Expand All @@ -28,14 +28,14 @@ def __init__(self, input_args: InputArgs):
self.llm: Optional[BaseLLM] = None
self.summary: SummaryModel = SummaryModel()

def load_data(self) -> Generator[MetaData, None, None]:
def load_data(self) -> Generator[Data, None, None]:
"""
Reads data from given path.

**Run in executor.**

Returns:
Generator[MetaData]
Generator[Data]
"""
datasource_cls = datasource_map[self.input_args.dataset]
dataset_cls = dataset_map[self.input_args.dataset]
Expand Down Expand Up @@ -123,7 +123,7 @@ def process_batch(batch: List):

log.debug('[Summary]: ' + str(self.summary))

def evaluate_single_data(self, group_type, group, data: MetaData):
def evaluate_single_data(self, group_type, group, data: Data):
result_info = ResultInfo(data_id=data.data_id, prompt=data.prompt, content=data.content)
if self.input_args.save_raw:
result_info.raw_data = data.raw_data
Expand Down Expand Up @@ -167,7 +167,7 @@ def evaluate_single_data(self, group_type, group, data: MetaData):
result_info.reason_list.append(reason)
return result_info

def evaluate_rule(self, group: List[BaseRule], d: MetaData) -> ResultInfo:
def evaluate_rule(self, group: List[BaseRule], d: Data) -> ResultInfo:
result_info = ResultInfo(data_id=d.data_id, prompt=d.prompt, content=d.content)
log.debug("[RuleGroup]: " + str(group))
bad_type_list = []
Expand Down Expand Up @@ -199,7 +199,7 @@ def evaluate_rule(self, group: List[BaseRule], d: MetaData) -> ResultInfo:
result_info.reason_list = good_reason_list
return result_info

def evaluate_prompt(self, group: List[BasePrompt], d: MetaData) -> ResultInfo:
def evaluate_prompt(self, group: List[BasePrompt], d: Data) -> ResultInfo:
result_info = ResultInfo(data_id=d.data_id, prompt=d.prompt, content=d.content)
log.debug("[PromptGroup]: " + str(group))
bad_type_list = []
Expand Down
8 changes: 4 additions & 4 deletions dingo/exec/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dingo.config import GlobalConfig
from dingo.data import Dataset, DataSource, dataset_map, datasource_map
from dingo.exec.base import ExecProto, Executor
from dingo.io import InputArgs, MetaData, ResultInfo, SummaryModel
from dingo.io import Data, InputArgs, ResultInfo, SummaryModel
from dingo.model import Model
from dingo.model.llm.base import BaseLLM
from dingo.model.modelres import ModelRes
Expand Down Expand Up @@ -146,7 +146,7 @@ def evaluate(self):

def evaluate_item(self, data_rdd_item, broadcast_group, broadcast_llm) -> Dict[str, Any]:
"""Evaluate a single data item using broadcast variables."""
data: MetaData = data_rdd_item
data: Data = data_rdd_item
result_info = ResultInfo(data_id=data.data_id, prompt=data.prompt, content=data.content)

if self.input_args.save_raw:
Expand Down Expand Up @@ -192,7 +192,7 @@ def evaluate_item(self, data_rdd_item, broadcast_group, broadcast_llm) -> Dict[s

return result_info.to_dict()

def evaluate_rule(self, group: List[BaseRule], data: MetaData) -> ResultInfo:
def evaluate_rule(self, group: List[BaseRule], data: Data) -> ResultInfo:
"""Evaluate data against a group of rules."""
result_info = ResultInfo(data_id=data.data_id, prompt=data.prompt, content=data.content)

Expand Down Expand Up @@ -224,7 +224,7 @@ def evaluate_rule(self, group: List[BaseRule], data: MetaData) -> ResultInfo:

return result_info

def evaluate_prompt(self, group: List[BasePrompt], data: MetaData, llm: BaseLLM) -> ResultInfo:
def evaluate_prompt(self, group: List[BasePrompt], data: Data, llm: BaseLLM) -> ResultInfo:
"""Evaluate data against a group of prompts using LLM."""
if llm is None:
raise ValueError("LLM is required for prompt evaluation")
Expand Down
2 changes: 1 addition & 1 deletion dingo/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dingo.io.input.Data import Data
from dingo.io.input.InputArgs import InputArgs
from dingo.io.input.MetaData import MetaData
from dingo.io.output.ResultInfo import ResultInfo
from dingo.io.output.SummaryModel import SummaryModel
4 changes: 2 additions & 2 deletions dingo/io/input/MetaData.py → dingo/io/input/Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from pydantic import BaseModel


class MetaData(BaseModel):
class Data(BaseModel):
"""
Metadata, output of converter.
Data, output of converter.
"""
data_id: str
prompt: str = None
Expand Down
4 changes: 2 additions & 2 deletions dingo/model/llm/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Protocol

from dingo.io import MetaData
from dingo.io import Data
from dingo.model.modelres import ModelRes
from dingo.model.prompt.base import BasePrompt

Expand All @@ -11,5 +11,5 @@ def set_prompt(cls, prompt: BasePrompt):
...

@classmethod
def eval(cls, input_data: MetaData) -> ModelRes:
def eval(cls, input_data: Data) -> ModelRes:
...
Loading