Skip to content

Commit

Permalink
Merge pull request #2 from Samoed/parser
Browse files Browse the repository at this point in the history
json to csv
  • Loading branch information
Samoed authored Apr 19, 2024
2 parents 862368c + cd05d4e commit 26be306
Showing 1 changed file with 139 additions and 0 deletions.
139 changes: 139 additions & 0 deletions notebooks/combine_repos_jsons.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_88815/3029550948.py:4: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm\n"
]
}
],
"source": [
"import json\n",
"import os\n",
"\n",
"import pandas as pd\n",
"from tqdm.autonotebook import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"repos_path = \"../src/parser/data\"\n",
"out_dir = \"data\"\n",
"closed_prs_dir = os.path.join(out_dir, \"closed_prs\")\n",
"merged_prs_dir = os.path.join(out_dir, \"merged_prs\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(out_dir):\n",
" os.makedirs(out_dir)\n",
" os.makedirs(closed_prs_dir)\n",
" os.makedirs(merged_prs_dir)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5e56a473b5f84ca19bbaedc0246f26c6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/86 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"file_extension = \".gzip\"\n",
"export_params = {\"index\": False, \"escapechar\": \"@\"} # , \"compression\": file_extension[1:]\n",
"\n",
"all_repos = os.listdir(repos_path)\n",
"for repo_name in tqdm(all_repos):\n",
" repo_path = os.path.join(repos_path, repo_name)\n",
" if not os.path.isdir(repo_path):\n",
" continue\n",
"\n",
" files = os.listdir(repo_path)\n",
" df = None\n",
" for file in files:\n",
" if not file.endswith(\".json\"):\n",
" continue\n",
"\n",
" file_path = os.path.join(repo_path, file)\n",
" with open(file_path) as f:\n",
" data = json.load(f)\n",
"\n",
" df = pd.DataFrame(data) if df is None else pd.concat([df, pd.DataFrame(data)])\n",
"\n",
" # df.to_parquet(\n",
" # os.path.join(closed_prs_dir, repo_name + \".parquet\"), # + file_extension\n",
" # **export_params,\n",
" # )\n",
" df.to_csv(\n",
" os.path.join(closed_prs_dir, repo_name), # + file_extension\n",
" **export_params,\n",
" )\n",
" df = df[df[\"merged_at\"].notna()]\n",
" df.to_csv(\n",
" os.path.join(merged_prs_dir, repo_name), # + file_extension\n",
" **export_params,\n",
" )\n",
" # df.to_parquet(\n",
" # os.path.join(merged_prs_dir, repo_name + \".parquet\"), # + file_extension\n",
" # **export_params,\n",
" # )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 26be306

Please sign in to comment.