---
task_categories:
- text-generation
language:
- en
size_categories:
- 10K<n<100K
---
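The dataset was built with the `distilabel` script below. It merges the filtered [synth-apigen-qwen](https://huggingface.co/datasets/argilla-warehouse/synth-apigen-qwen) and [synth-apigen-llama](https://huggingface.co/datasets/argilla-warehouse/synth-apigen-llama) generations, deduplicates them, and augments the result with irrelevance examples for relevance detection.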
```python
import copy
import hashlib
import json
import random
from pathlib import Path

import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from distilabel.pipeline import Pipeline
from distilabel.steps import make_generator_step, MinHashDedup

# Load both generation runs and keep only the rows that passed the
# execution and semantic checks.
ds_qwen_final = load_dataset("argilla-warehouse/synth-apigen-qwen", split="train")
ds_llama_final = load_dataset("argilla-warehouse/synth-apigen-llama", split="train")

ds_qwen_filtered = ds_qwen_final.filter(
    lambda x: x["keep_row_after_execution_check"] and x["keep_row_after_semantic_check"]
)
ds_llama_filtered = ds_llama_final.filter(
    lambda x: x["keep_row_after_execution_check"] and x["keep_row_after_semantic_check"]
)

def add_hash_id(row):
    # Hash of the query/answers pair, used to drop exact duplicates
    # across the two sources.
    row["hash_id"] = hashlib.sha256(
        (row["query"] + row["answers"]).encode("utf-8")
    ).hexdigest()
    return row

ds_qwen_filtered_with_hash = ds_qwen_filtered.map(add_hash_id)
ds_llama_filtered_with_hash = ds_llama_filtered.map(add_hash_id)

select_columns = [
    "func_name",
    "func_desc",
    "tools",
    "query",
    "answers",
    "model_name",
    "hash_id",
]

df_llama = ds_llama_filtered_with_hash.select_columns(select_columns).to_pandas()
df_qwen = ds_qwen_filtered_with_hash.select_columns(select_columns).to_pandas()

# Merge both sources and drop exact duplicates by hash.
df = pd.concat([df_qwen, df_llama], axis=0).drop_duplicates("hash_id")
ds_full = Dataset.from_pandas(df, preserve_index=False)
```
Fuzzy deduplication of the queries is done with a small `distilabel` pipeline:

```python
with Pipeline(name="synth-apigen-train") as pipeline:
    batch_size = 1000
    loader = make_generator_step(ds_full, batch_size=batch_size)
    # MinHash over 5-gram shingles of the query; near-duplicates
    # (Jaccard similarity above 0.9) are flagged via the
    # `keep_row_after_minhash_filtering` column.
    minhash_dedup = MinHashDedup(
        tokenizer="ngrams",
        n=5,
        threshold=0.9,
        storage="dict",
        input_mappings={"text": "query"},
    )
    loader >> minhash_dedup

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
    ds = distiset["default"]["train"]
```
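As an optional sanity check (not part of the original output shown here), you can inspect how many rows the MinHash step flagged:

```python
# Count rows kept vs. flagged as near-duplicates by MinHashDedup.
print(ds.to_pandas()["keep_row_after_minhash_filtering"].value_counts())
```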
The tool definitions used for the relevance-detection examples are loaded from a local JSONL file:

```python
def read_jsonl(path):
    # Index the tool definitions by function name.
    data = {}
    with open(path, "r") as f:
        for row in f:
            tool = json.loads(row)
            data[tool["function"]["name"]] = tool
    return data

p = Path.home() / "Downloads/synth_lib/tools.jsonl"
tools = read_jsonl(p)
```
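For reference, each line of `tools.jsonl` is a JSON tool definition. Only the fields the script actually accesses (`function.name`, `function.parameters.properties`, `function.parameters.required`) are grounded in the code; the rest of this hypothetical record is an assumption based on the common function-calling schema:

```python
# Hypothetical tool record; `get_weather` and its parameters are
# illustrative, not taken from the actual tools.jsonl file.
example_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the weather in a city on a given date.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "date": {"type": "string"},
            },
            "required": ["city"],
        },
    },
}
```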
```python
# Relevance detection for training: rewrite part of the dataset so the
# model also learns when a query cannot be answered with the given tools.
tool_calls = list(tools.values())

# Distribution over how many extra (distractor) tools are added per row.
extra_tools = {0: 0.3, 1: 0.2, 2: 0.2, 3: 0.1, 4: 0.1, 5: 0.1}
tool_number = list(extra_tools.keys())
tool_prob = list(extra_tools.values())

no_tool = json.dumps([])

def sample_new_tool(current_tool, tool_calls):
    # Draw a random tool that is different from the current one.
    new_tool = current_tool
    while current_tool == new_tool:
        new_tool = random.choice(tool_calls)
    return json.dumps([new_tool])

def remove_parameter(tool):
    # Deep copy so the shared tool definition in `tool_calls` is not mutated.
    new_tool = copy.deepcopy(tool)
    if len(new_tool["function"]["parameters"]["properties"]) > 1:
        param_names = list(new_tool["function"]["parameters"]["properties"])
        to_remove = random.choice(param_names)
        new_tool["function"]["parameters"]["properties"].pop(to_remove)
        if ("required" in new_tool["function"]["parameters"]) and (
            to_remove in new_tool["function"]["parameters"]["required"]
        ):
            new_tool["function"]["parameters"]["required"].remove(to_remove)
    return json.dumps([new_tool])
```
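Using the hypothetical `example_tool` from above, `remove_parameter` returns a single-tool list with one property (and its `required` entry, if any) dropped:

```python
# Illustration only: one of "city"/"date" is removed at random.
stripped = json.loads(remove_parameter(example_tool))[0]
print(list(stripped["function"]["parameters"]["properties"]))  # e.g. ["city"]
```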
```python
def remove_tools(row):
    p = random.random()
    current_tool = json.loads(row["tools"])[0]
    if p < (1 / 3):
        # Scenario 1: the provided tool cannot solve the query
        # (e.g., query: "I want to know the weather in Palo Alto on
        # Dec 25, 2023", provided tool: get_house_price(city)).
        row["tools"] = sample_new_tool(current_tool, tool_calls)
        row["answers"] = "The query cannot be answered with the provided tools."
    elif p < (2 / 3):
        # Scenario 2: the provided tool is missing key arguments needed to
        # solve the query (e.g., query: "I want to know the weather in
        # Palo Alto on Dec 25, 2023", provided tool: get_weather(city)).
        row["tools"] = remove_parameter(current_tool)
        row["answers"] = "The given question lacks the parameters required by the function."
    else:
        # Scenario 3: no tools provided at all.
        row["tools"] = no_tool
        row["answers"] = "The query cannot be answered, no tools were provided."
    return row

def add_random_tools(row):
    # Only update rows that still have tools (Scenario 3 rows stay empty).
    current_tools = json.loads(row["tools"])
    if len(current_tools) > 0:
        current_tool = current_tools[0]
        # Sample how many distractor tools to add (random.choices samples
        # with replacement), then shuffle so the relevant tool does not
        # always come last.
        k = random.choices(tool_number, tool_prob)[0]
        new_tools = random.choices(tool_calls, k=k)
        new_tools.append(current_tool)
        row["tools"] = json.dumps(random.sample(new_tools, len(new_tools)))
    return row
```
Finally, 6,000 deduplicated rows are rewritten as irrelevance examples, distractor tools are added across the board, and the result is pushed to the Hub:

```python
columns = ["func_name", "func_desc", "tools", "query", "answers", "model_name", "hash_id"]
ds_filtered = ds.filter(lambda x: x["keep_row_after_minhash_filtering"]).select_columns(columns)

selection = ds_filtered.shuffle(422).select(range(6000))
new_ds = concatenate_datasets(
    [
        selection.map(remove_tools, num_proc=4),
        ds_filtered,  # deduplicated base set, same columns as `selection`
    ]
)
new_ds = new_ds.map(add_random_tools, num_proc=4)

# Distribution of the number of tools per row after the augmentation.
print("Value counts:")
new_df = new_ds.to_pandas()
print(new_df["tools"].apply(lambda x: json.loads(x)).apply(len).value_counts())

new_ds.push_to_hub("argilla-warehouse/synth-apigen-v0.2", private=True)
```
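Once pushed, the dataset can be loaded as usual (assuming access to the private repo):

```python
from datasets import load_dataset

ds = load_dataset("argilla-warehouse/synth-apigen-v0.2", split="train")
```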