Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated the links to the datasets, and the notebooks. The improvements #3191

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"oa_stackexchange": "donfu/oa-stackexchange",
"stable_diffusion_instructional_dataset": "MadVoyager/stable_diffusion_instructional_dataset",
"ru_riddles_337": "0x22almostEvil/ru-riddles-377",
"instructional_codesearchnet_python": "Nan-Do/instructional_codesearchnet_python",
"instructional_codesearchnet_python": "Nan-Do/instructional_code-search-net-python",
"tatoeba_mt_qna_oa": "0x22almostEvil/tatoeba-mt-qna-oa",
"reasoning_bg_oa": "0x22almostEvil/reasoning_bg_oa",
"reasoning_gsm_qna_oa": "0x22almostEvil/reasoning-gsm-qna-oa",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,29 +1,16 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install datasets tqdm"
],
"execution_count": null,
"metadata": {
"id": "zLxBMw9Lsr6I"
"id": "zLxBMw9Lsr6I",
"scrolled": true
},
"execution_count": null,
"outputs": []
"outputs": [],
"source": [
"!pip install datasets tqdm lemminflect"
]
},
{
"cell_type": "code",
Expand All @@ -41,27 +28,36 @@
"from datasets import load_dataset\n",
"from tqdm.auto import tqdm\n",
"from random import random, randint\n",
"from lemminflect import getAllInflections, getLemma\n",
"\n",
"ONE_STEP_OUPUT_CODE_TEMPLATES = [\n",
" # VBZ\n",
" \"Can you write a program in {lang} where it\\n\",\n",
" \"How would you implement a function in {lang} that\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Can you create a {lang} program that\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
" # VBP\n",
" \"Implement a function in {lang} to\\n\",\n",
" \"Write a {lang} script for\\n\",\n",
" \"How would you code a program in {lang} to\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Write a {lang} script to\\n\",\n",
" \"Create a {lang} function to\\n\",\n",
" \"Write a {lang} program that can\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
" # VBG\n",
" \"Write a {lang} script for\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Implement a {lang} function for\\n\",\n",
"]\n",
"\n",
"ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [\n",
" # General answer\n",
" \"Explain what the following {lang} code does\\n\",\n",
" \"Can you tell what is the following {lang} function doing\\n\",\n",
" \"Here you have a function in {lang}, explain what it does\\n\",\n",
" \"Make a summary of the following {lang} code\\n\",\n",
" \"Can you generate a brief explanation for the following {lang} code\\n\",\n",
" \"How would you explain what the following {lang} function does\\n\",\n",
" # Documentation\n",
" \"Can you generate the documentation for the following {lang} function\\n\",\n",
" \"Create a docstring for the following {lang} code\\n\",\n",
" \"Given the following {lang} function, write the documentation\\n\",\n",
Expand All @@ -88,9 +84,22 @@
" return \"\\n\".join([lines[0]] + lines[idx + 1 :])\n",
"\n",
"\n",
"def process_summary(summary, tag):\n",
" words = summary.split()\n",
" lemma = getLemma(words[0].lower(), upos=\"VERB\")[0]\n",
" inflections = getAllInflections(lemma)\n",
"\n",
" if tag not in inflections:\n",
" words[0] = words[0].lower()\n",
" else:\n",
" words[0] = inflections[tag][0]\n",
"\n",
" return \" \".join(words)\n",
"\n",
"\n",
"lang = \"Python 3\"\n",
"data = defaultdict(list)\n",
"dataset = load_dataset(\"Nan-Do/codesearchnet-python\")\n",
"dataset = load_dataset(\"Nan-Do/code-search-net-python\")\n",
"\n",
"for data_point in tqdm(dataset[\"train\"]):\n",
" code = data_point[\"original_string\"]\n",
Expand All @@ -99,15 +108,23 @@
" # Generate code\n",
" if random() > 0.5:\n",
" idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)\n",
" if 0 <= idx <= 3:\n",
" tag = \"VBZ\"\n",
" elif 4 <= idx <= 8:\n",
" tag = \"VBP\"\n",
" else:\n",
" tag = \"VBG\"\n",
" summary = process_summary(summary, tag)\n",
" template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary\n",
" data[\"INSTRUCTION\"].append(template)\n",
" data[\"RESPONSE\"].append(code)\n",
" # Generate summary\n",
" else:\n",
" # We are generating the docstring or a summary so we better remove it from\n",
" # the function\n",
" if random() < 0.9:\n",
" code = remove_docstring(code)\n",
" # if random() < 0.9:\n",
" # code = remove_docstring(code)\n",
" code = remove_docstring(code)\n",
" idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code\n",
" data[\"INSTRUCTION\"].append(template)\n",
Expand All @@ -117,35 +134,66 @@
" data[\"RESPONSE\"].append('\"\"\"' + summary + '\"\"\"')\n",
"\n",
"df = pd.DataFrame(data=data)\n",
"df.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
"df.to_parquet(\"instructional_dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_6jaUZRsy1-R"
},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"id": "_6jaUZRsy1-R"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"source": [
"from datasets import Dataset\n",
"\n",
"ds = Dataset.from_parquet(\"dataset.parquet\")\n",
"ds.push_to_hub(\"Nan-Do/open-assistant-codesearchnet-python\")"
],
"execution_count": null,
"metadata": {
"id": "DSHrvbF6tIyd"
},
"outputs": [],
"source": [
"from datasets import Dataset\n",
"\n",
"ds = Dataset.from_parquet(\"instructional_dataset.parquet\")\n",
"ds.push_to_hub(\"Nan-Do/instructional_code-search-net-python\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": []
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
]
},
"nbformat": 4,
"nbformat_minor": 1
}
5 changes: 3 additions & 2 deletions data/datasets/instructional_codesearchnet_python/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
This dataset is a template-generated instructional Python dataset created
from an annotated version of the code-search-net dataset. The annotated version
of code-search-net dataset can be found
[here](https://huggingface.co/datasets/Nan-Do/codesearchnet-python).
[here](https://huggingface.co/datasets/Nan-Do/code-search-net-python).

The dataset contains around 450,000 annotated Python functions. The dataset is
split into two blocks, one in which the task is starting from the annotated
Expand All @@ -14,7 +14,8 @@ been used.

**Note**: some summarisation tasks are very easy because the prompt already
contains a docstring in the function which is then used as the ground truth
response. It may be useful to filter these in future.
response. It may be useful to filter these in the future. (All docstrings
have now been removed.)

### Summarize_codesearchnet_for_python.ipynb

Expand Down
Loading