Add files via upload

fatimakhann · web-flow · commit 69e68a4008d6 · 2024-06-17T18:50:26.000+02:00
diff --git a/PythonNotebook_CodingTutorialDocumented.ipynb b/PythonNotebook_CodingTutorialDocumented.ipynb
@@ -1,200 +1 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "71ea34bd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3bd1661e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.read_csv('FilterandFireData.csv')\n",
-    "display(df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4887ed3c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#extract those with 40ms to specifically extract the data\n",
-    "forty_time = five_connections[five_connections['Time'] == 40]\n",
-    "display(forty_time)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ced5f652",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#barplot\n",
-    "mean_accuracies = forty_time.groupby('digit')['Accuracy.FF'].mean()\n",
-    "plt.figure(figsize=(10, 6)) # for changing the figure size\n",
-    "mean_accuracies.plot(kind='bar', color= 'green')\n",
-    "plt.title('Mean Accuracy for Each Digit (0-9)')\n",
-    "plt.xlabel('Digits')\n",
-    "plt.ylabel('Mean Accuracy.FF')\n",
-    "plt.ylim(85,100) \n",
-    "plt.grid(axis='y')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e09f0511",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#boxplot\n",
-    "plt.figure(figsize=(10,6))\n",
-    "#df.boxplot(column='Accuracy.FF', by='digit', grid = False)\n",
-    "sns.boxplot(x='digit', y='Accuracy.FF', data=df, palette='pastel')\n",
-    "plt.title('Boxplot of Accuracy for Each Digit (0-9)')\n",
-    "plt.suptitle('')  # Suppress the default title to avoid duplication\n",
-    "plt.xlabel('Digits')\n",
-    "plt.ylabel('Accuracy.FF')\n",
-    "plt.ylim(85, 100)  # Set the y-axis range\n",
-    "plt.grid(axis='y')\n",
-    "plt.grid(axis='x')\n",
-    "plt.show()"
-   ]
-},
-{
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3dfdeb10",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#violin plot\n",
-    "plt.figure(figsize=(10,6))\n",
-    "sns.violinplot(x='digit', y='Accuracy.FF', data=df, inner=\"quartile\", palette='pastel')\n",
-    "plt.title('Violin Plot of Accuracy for Each Digit (0-9)')\n",
-    "plt.xlabel('Digits')\n",
-    "plt.ylabel('Accuracy')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "915a4518",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#histogram\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.hist(digit_9['Accuracy.FF'], bins=20, color='skyblue', edgecolor='black')\n",
-    "plt.title('Histogram of Accuracy')\n",
-    "plt.xlabel('Accuracy')\n",
-    "plt.ylabel('Frequency')\n",
-    "plt.grid(axis='y')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6941a7ce",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "genes_data = pd.read_csv('heatmapExpressionData.csv')\n",
-    "display(genes_data)\n",
-    "genes = genes_data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "33e310ca",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "genes.set_index('GeneName', inplace=True)\n",
-    "\n",
-    "#heatmap\n",
-    "plt.figure(figsize=(12, 8)) #figsize 25,30 adequate to display all genes here\n",
-    "sns.heatmap(genes, cmap='viridis',linewidths=0.5) #,yticklabels=True \n",
-    "plt.title('Heatmap of Gene Expression Data')\n",
-    "plt.xlabel('Samples')\n",
-    "plt.ylabel('Genes')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9521c6be",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.figure(figsize=(10, 6))\n",
-    "sns.scatterplot(x='NAc1', y='NAc4', data=genes)\n",
-    "plt.title('Scatter Plot of Gene Expression')\n",
-    "plt.grid(True)\n",
-    "\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b2865082",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "transposed_genes = genes.set_index('GeneName').transpose()\n",
-    "\n",
-    "# Plot a line plot of the gene expression data\n",
-    "plt.figure(figsize=(12, 8))\n",
-    "#for gene in transposed_genes.columns:\n",
-    "    #plt.plot(transposed_genes.index, transposed_genes[gene], label=gene)\n",
-    "transposed_genes.plot(legend=False, alpha=0.5)\n",
-    "plt.title('Line Plot of Gene Expression Data')\n",
-    "plt.xlabel('Samples')\n",
-    "plt.ylabel('Gene Expression')\n",
-    "#plt.legend(loc='upper right', bbox_to_anchor=(1.25, 1))\n",
-    "#plt.grid(True)\n",
-    "plt.show()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+{"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"0a79625e-04d8-4435-82d6-d8e05bc2b63a","cell_type":"markdown","source":"### General instructions.\n\nIn this interactive tutorial, you can run each one of the cells by either clicking the ‘play’ button or by pressing ‘Shift + Enter’. You can make changes to the code as well.\n\n# Graphical Forms of Data Charts: Dataset 1\n\n## Filter and Fire dataset\n\nImport the required packages, then read and observe the Filter and Fire dataset in Python:","metadata":{}},{"id":"71ea34bd","cell_type":"code","source":"import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt","metadata":{"trusted":true},"outputs":[],"execution_count":1},{"id":"c043c180-ca8b-412c-af42-e55c2ad1c2ed","cell_type":"markdown","source":"Python needs to import certain packages that help in executing commands of different types. \nWe import the packages e.g. 
pandas and wherever we call it, we use 'pd' for convenience.","metadata":{}},{"id":"3bd1661e","cell_type":"code","source":"df = pd.read_csv('FilterandFireData.csv')\ndisplay(df) #displays what the data looks like. 'print(df.head())' shows only the first few rows ","metadata":{"trusted":true,"scrolled":true},"outputs":[],"execution_count":null},{"id":"2af14fe6-7af0-469c-9c1b-39f120309f64","cell_type":"markdown","source":"Since the data file is in the same repository, we can directly call it. \nIf it was in another folder, a path to the respective folder would have to be added.","metadata":{}},{"id":"a2e489b2-766e-4b08-bb1d-5c9e38131683","cell_type":"markdown","source":"## Barplot\nWe can now start by observing the Baseline Accuracy displayed by neurons\nfor the detection of each handwritten digit by making a Bar plot.","metadata":{}},{"id":"ced5f652","cell_type":"code","source":"mean_accuracies = df.groupby('digit')['Accuracy.FF'].mean()\nplt.figure(figsize=(10, 6)) # for changing the figure size\nmean_accuracies.plot(kind='bar', color= 'orange')\nplt.title('Mean Accuracy for Each Digit (0-9)')\nplt.xlabel('Digits')\nplt.ylabel('Mean Accuracy.FF')\nplt.ylim(85,100) \nplt.grid(axis='y')\nplt.show()","metadata":{"trusted":true,"scrolled":true},"outputs":[],"execution_count":null},{"id":"caca3c35-a0df-4ff2-80cb-96852e08f46b","cell_type":"markdown","source":"The *groupby* function takes the respective category and groups the data accordingly. Here, we first compute the mean of each digit group and then plot the means as a bar graph, giving it a specific color.\n\nAs you can see, this is the simplest form of a plot. You set the variable for a desired category and plot the saved variable in the form of a graph. 
\n\n* Exercise: Change the color of the bar plot.","metadata":{}},{"id":"7c309042-60e2-4174-b65c-71cd074a85de","cell_type":"markdown","source":"## Box plot","metadata":{}},{"id":"e09f0511","cell_type":"code","source":"plt.figure(figsize=(10,6))\n#df.boxplot(column='Accuracy.FF', by='digit', grid = False)\nsns.boxplot(x='digit', y='Accuracy.FF', data=df, palette='pastel')\nplt.title('Boxplot of Accuracy for Each Digit (0-9)')\nplt.suptitle('')  # Suppress the default title to avoid duplication\nplt.xlabel('Digits')\nplt.ylabel('Accuracy.FF')\nplt.ylim(85, 100)  # Set the y-axis range\nplt.grid(axis='y')\nplt.grid(axis='x')\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"0673cbbe-2d63-49da-b3c0-d7eb03fa8073","cell_type":"markdown","source":"The *seaborn* package provides additional visualization tools such as color palettes. This boxplot uses it for a color palette; however, the palette does not provide additional details in the graph, so it can be removed. Seaborn (sns) also requires certain arguments for its default execution. \n\n* Exercise: \ni) What would you do to remove the color palette?\n\nii) Can you find the variables sns would not work without? 
What happens if you remove them?","metadata":{}},{"id":"10ca5ee5-f93f-4be5-9129-62f633c10671","cell_type":"markdown","source":"## Violin plot","metadata":{}},{"id":"3dfdeb10","cell_type":"code","source":"plt.figure(figsize=(10,6))\nsns.violinplot(x='digit', y='Accuracy.FF', data=df, inner=\"quartile\", palette='pastel')\nplt.title('Violin Plot of Accuracy for Each Digit (0-9)')\nplt.xlabel('Digits')\nplt.ylabel('Accuracy')\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"e179cb8c-6c59-4809-af8d-89cee5743cfa","cell_type":"markdown","source":"## Histogram\nWe will now plot a histogram but only for the values that were trained with the **digit 9**.","metadata":{}},{"id":"915a4518","cell_type":"code","source":"digit_9 = df[df['digit'] == 9] #subset digit 9 data\n#display(digit_9)\nplt.figure(figsize=(10, 6))\nplt.hist(digit_9['Accuracy.FF'], bins=20, color='skyblue', edgecolor='black')\nplt.title('Histogram of Accuracy')\nplt.xlabel('Accuracy')\nplt.ylabel('Frequency')\nplt.grid(axis='y')\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"b3f98e96-96c9-4e9a-9b27-3474d840404f","cell_type":"markdown","source":"In histograms, bins define the division of the histogram into bars. The number of bins changes the level of detail with which the distribution of the data is shown. \n\n* Exercise: What happens if you change the bins?","metadata":{}},{"id":"5ab36e55-6426-4e9e-b34c-52977c2607c3","cell_type":"markdown","source":"# Graphical Forms of Data Charts: Dataset 2\n\n## Brain region-specific Gene Expression\n\nRead and observe the Brain region-specific Gene Expression data in Python:","metadata":{}},{"id":"6941a7ce","cell_type":"code","source":"genes = pd.read_csv('ExpressionData.txt', sep='\\t', index_col=0)\ndisplay(genes)","metadata":{"trusted":true,"scrolled":true},"outputs":[],"execution_count":null},{"id":"93b83950-e11f-4a33-928d-b0cd1d622d22","cell_type":"markdown","source":"The pandas package is used to read the data files. 
The *read_csv* function in pandas is a versatile function used to read data from a variety of delimited text formats, not just CSV files, in this case a .txt file. The delimiter is set with sep='\\t' because the columns in this file are separated by tabs. *index_col=0* uses the first column as the index. This is because the heatmap function requires a numeric matrix as input, so the first column has to be stored as row labels (the index) rather than as a data column. \n\n## Heatmap","metadata":{}},{"id":"33e310ca","cell_type":"code","source":"plt.figure(figsize=(12, 8)) #figsize 25,30 \nsns.heatmap(genes, cmap='viridis', annot=False, cbar=True) #,yticklabels=True ,linewidths=0.5\nplt.title('Heatmap of Gene Expression Data')\nplt.xlabel('Samples')\nplt.ylabel('Genes')\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"fc49d9c0-6032-4277-b188-9f7b7fccb4a6","cell_type":"markdown","source":"The sequence of code matters. Here, the first line forms the base for a heatmap execution. You could also run this without the *plt* function line but it would change the display.\n\n* Exercise: i) What is compromised if you run without the first line?    \nii) How could you incorporate maximum gene names on the display?\n\n## Scatterplot","metadata":{}},{"id":"9521c6be","cell_type":"code","source":"plt.figure(figsize=(10, 6))\nsns.scatterplot(x='NAc1', y='NAc4', data=genes)\nplt.title('Scatter Plot of Gene Expression')\nplt.grid(True)\n\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"8a292f04-cb8b-48b4-aaeb-1f691390caf9","cell_type":"markdown","source":"We have plotted the same brain region of two different samples (Nucleus Accumbens for rat1 and rat4), and we can see that both of them are highly correlated with each other.\n\n* Exercise: Plot the NAc expression data for any sample against DG of the same sample.\n\\\nWhat is the dispersion like? Is the expression between the two brain regions correlated with each other? 
Is\nit what you expected?\n\n## Line plot","metadata":{}},{"id":"b2865082","cell_type":"code","source":"genes_T = genes.T #Transpose the data\nplt.figure(figsize=(12, 8))\n#for gene in genes_T.columns:\n    #plt.plot(genes_T.index, genes_T[gene], label=gene)\ngenes_T.plot(ax=plt.gca(), legend=False, alpha=0.5) #draw on the figure created above; without ax, DataFrame.plot opens a new figure\nplt.title('Line Plot of Gene Expression Data')\nplt.xlabel('Samples')\nplt.ylabel('Gene Expression')\n#plt.legend(loc='upper right', bbox_to_anchor=(1.25, 1))\n#plt.grid(True)\nplt.show()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"a87b1b53-9e2a-4fe6-aa13-fa7ff851695d","cell_type":"markdown","source":"* Exercise: i) Try the code without transposing the data and see the difference. Can you explain why it would or would not make a difference?\n          ii) Uncomment the other lines of code. What do you observe?\n\n\nNote: you can also use the *melt* function to convert the data to a long format for seaborn. This reshapes the data frame for a line plot. ","metadata":{}},{"id":"ab8ee4b1-8d79-4222-8c31-ee4d91e860a0","cell_type":"markdown","source":"### Advanced exercises.\nIf you'd like to have an extra challenge, we suggest you download the original datasets. You can then try to replicate the plots from the research papers. \n* Filter and Fire original Dataset: https://www.kaggle.com/datasets/selfishgene/fiter-and-fire-paper\n* Brain region-specific expression data original Dataset (Fig1d.Region_sepcific_expressed_Gene_cpm_Zscore.txt file): https://figshare.com/projects/Methamphetamine-induced_region-specific_transcriptomic_and_epigenetic_changes_in_the_brain_of_male_rats/177378","metadata":{}}]}