---
task_categories:
- text-generation
language:
- en
size_categories:
- 10K<n<100K
---
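The dataset was built with the `distilabel` script below. It merges the filtered [synth-apigen-qwen](https://huggingface.co/datasets/argilla-warehouse/synth-apigen-qwen) and [synth-apigen-llama](https://huggingface.co/datasets/argilla-warehouse/synth-apigen-llama) generations, deduplicates them, and augments the result with irrelevance examples for relevance detection.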
```python
import copy
import hashlib
import json
import random
from pathlib import Path

import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from distilabel.pipeline import Pipeline
from distilabel.steps import make_generator_step, MinHashDedup

# Load both generation runs and keep only the rows that passed the
# execution and semantic checks.
ds_qwen_final = load_dataset("argilla-warehouse/synth-apigen-qwen", split="train")
ds_llama_final = load_dataset("argilla-warehouse/synth-apigen-llama", split="train")

ds_qwen_filtered = ds_qwen_final.filter(
    lambda x: x["keep_row_after_execution_check"] and x["keep_row_after_semantic_check"]
)
ds_llama_filtered = ds_llama_final.filter(
    lambda x: x["keep_row_after_execution_check"] and x["keep_row_after_semantic_check"]
)

def add_hash_id(row):
    # Hash of the query/answers pair, used to drop exact duplicates
    # across the two sources.
    row["hash_id"] = hashlib.sha256(
        (row["query"] + row["answers"]).encode("utf-8")
    ).hexdigest()
    return row

ds_qwen_filtered_with_hash = ds_qwen_filtered.map(add_hash_id)
ds_llama_filtered_with_hash = ds_llama_filtered.map(add_hash_id)

select_columns = [
    "func_name",
    "func_desc",
    "tools",
    "query",
    "answers",
    "model_name",
    "hash_id",
]

df_llama = ds_llama_filtered_with_hash.select_columns(select_columns).to_pandas()
df_qwen = ds_qwen_filtered_with_hash.select_columns(select_columns).to_pandas()

# Merge both sources and drop exact duplicates by hash.
df = pd.concat([df_qwen, df_llama], axis=0).drop_duplicates("hash_id")
ds_full = Dataset.from_pandas(df, preserve_index=False)
```
Fuzzy deduplication of the queries is done with a small `distilabel` pipeline:

```python
with Pipeline(name="synth-apigen-train") as pipeline:
    batch_size = 1000
    loader = make_generator_step(ds_full, batch_size=batch_size)
    # MinHash over 5-gram shingles of the query; near-duplicates
    # (Jaccard similarity above 0.9) are flagged via the
    # `keep_row_after_minhash_filtering` column.
    minhash_dedup = MinHashDedup(
        tokenizer="ngrams",
        n=5,
        threshold=0.9,
        storage="dict",
        input_mappings={"text": "query"},
    )
    loader >> minhash_dedup

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
    ds = distiset["default"]["train"]
```
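As an optional sanity check (not part of the original output shown here), you can inspect how many rows the MinHash step flagged:

```python
# Count rows kept vs. flagged as near-duplicates by MinHashDedup.
print(ds.to_pandas()["keep_row_after_minhash_filtering"].value_counts())
```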
The tool definitions used for the relevance-detection examples are loaded from a local JSONL file:

```python
def read_jsonl(path):
    # Index the tool definitions by function name.
    data = {}
    with open(path, "r") as f:
        for row in f:
            tool = json.loads(row)
            data[tool["function"]["name"]] = tool
    return data

p = Path.home() / "Downloads/synth_lib/tools.jsonl"
tools = read_jsonl(p)
```
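For reference, each line of `tools.jsonl` is a JSON tool definition. Only the fields the script actually accesses (`function.name`, `function.parameters.properties`, `function.parameters.required`) are grounded in the code; the rest of this hypothetical record is an assumption based on the common function-calling schema:

```python
# Hypothetical tool record; `get_weather` and its parameters are
# illustrative, not taken from the actual tools.jsonl file.
example_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the weather in a city on a given date.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "date": {"type": "string"},
            },
            "required": ["city"],
        },
    },
}
```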
```python
# Relevance detection for training: rewrite part of the dataset so the
# model also learns when a query cannot be answered with the given tools.
tool_calls = list(tools.values())

# Distribution over how many extra (distractor) tools are added per row.
extra_tools = {0: 0.3, 1: 0.2, 2: 0.2, 3: 0.1, 4: 0.1, 5: 0.1}
tool_number = list(extra_tools.keys())
tool_prob = list(extra_tools.values())

no_tool = json.dumps([])

def sample_new_tool(current_tool, tool_calls):
    # Draw a random tool that is different from the current one.
    new_tool = current_tool
    while current_tool == new_tool:
        new_tool = random.choice(tool_calls)
    return json.dumps([new_tool])

def remove_parameter(tool):
    # Deep copy so the shared tool definition in `tool_calls` is not mutated.
    new_tool = copy.deepcopy(tool)
    if len(new_tool["function"]["parameters"]["properties"]) > 1:
        param_names = list(new_tool["function"]["parameters"]["properties"])
        to_remove = random.choice(param_names)
        new_tool["function"]["parameters"]["properties"].pop(to_remove)
        if ("required" in new_tool["function"]["parameters"]) and (
            to_remove in new_tool["function"]["parameters"]["required"]
        ):
            new_tool["function"]["parameters"]["required"].remove(to_remove)
    return json.dumps([new_tool])
```
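Using the hypothetical `example_tool` from above, `remove_parameter` returns a single-tool list with one property (and its `required` entry, if any) dropped:

```python
# Illustration only: one of "city"/"date" is removed at random.
stripped = json.loads(remove_parameter(example_tool))[0]
print(list(stripped["function"]["parameters"]["properties"]))  # e.g. ["city"]
```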
```python
def remove_tools(row):
    p = random.random()
    current_tool = json.loads(row["tools"])[0]
    if p < (1 / 3):
        # Scenario 1: the provided tool cannot solve the query
        # (e.g., query: "I want to know the weather in Palo Alto on
        # Dec 25, 2023", provided tool: get_house_price(city)).
        row["tools"] = sample_new_tool(current_tool, tool_calls)
        row["answers"] = "The query cannot be answered with the provided tools."
    elif p < (2 / 3):
        # Scenario 2: the provided tool is missing key arguments needed to
        # solve the query (e.g., query: "I want to know the weather in
        # Palo Alto on Dec 25, 2023", provided tool: get_weather(city)).
        row["tools"] = remove_parameter(current_tool)
        row["answers"] = "The given question lacks the parameters required by the function."
    else:
        # Scenario 3: no tools provided at all.
        row["tools"] = no_tool
        row["answers"] = "The query cannot be answered, no tools were provided."
    return row

def add_random_tools(row):
    # Only update rows that still have tools (Scenario 3 rows stay empty).
    current_tools = json.loads(row["tools"])
    if len(current_tools) > 0:
        current_tool = current_tools[0]
        # Sample how many distractor tools to add (random.choices samples
        # with replacement), then shuffle so the relevant tool does not
        # always come last.
        k = random.choices(tool_number, tool_prob)[0]
        new_tools = random.choices(tool_calls, k=k)
        new_tools.append(current_tool)
        row["tools"] = json.dumps(random.sample(new_tools, len(new_tools)))
    return row
```
Finally, 6,000 deduplicated rows are rewritten as irrelevance examples, distractor tools are added across the board, and the result is pushed to the Hub:

```python
columns = ["func_name", "func_desc", "tools", "query", "answers", "model_name", "hash_id"]
ds_filtered = ds.filter(lambda x: x["keep_row_after_minhash_filtering"]).select_columns(columns)

selection = ds_filtered.shuffle(422).select(range(6000))
new_ds = concatenate_datasets(
    [
        selection.map(remove_tools, num_proc=4),
        ds_filtered,  # deduplicated base set, same columns as `selection`
    ]
)
new_ds = new_ds.map(add_random_tools, num_proc=4)

# Distribution of the number of tools per row after the augmentation.
print("Value counts:")
new_df = new_ds.to_pandas()
print(new_df["tools"].apply(lambda x: json.loads(x)).apply(len).value_counts())

new_ds.push_to_hub("argilla-warehouse/synth-apigen-v0.2", private=True)
```
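Once pushed, the dataset can be loaded as usual (assuming access to the private repo):

```python
from datasets import load_dataset

ds = load_dataset("argilla-warehouse/synth-apigen-v0.2", split="train")
```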