|
997 | 997 | "id": "e5b77a38",
|
998 | 998 | "metadata": {},
|
999 | 999 | "source": [
|
1000 |
| - "[Link to DuckDB](https://bit.ly/4dJxNHV)." |
| 1000 | + "[Link to DuckDB](https://github.com/duckdb/duckdb)." |
| 1001 | + ] |
| 1002 | + }, |
| 1003 | + { |
| 1004 | + "cell_type": "markdown", |
| 1005 | + "id": "3be9869b-68e7-453d-bdba-6c678d0482d3", |
| 1006 | + "metadata": {}, |
| 1007 | + "source": [ |
| 1008 | + "### DuckDB: Query Pandas DataFrames Faster with Columnar Storage" |
| 1009 | + ] |
| 1010 | + }, |
| 1011 | + { |
| 1012 | + "cell_type": "code", |
| 1013 | + "execution_count": null, |
| 1014 | + "id": "59a4e2bb-2c3c-4276-a163-a98f1b15625c", |
| 1015 | + "metadata": { |
| 1016 | + "editable": true, |
| 1017 | + "slideshow": { |
| 1018 | + "slide_type": "" |
| 1019 | + }, |
| 1020 | + "tags": [ |
| 1021 | + "hide-cell" |
| 1022 | + ] |
| 1023 | + }, |
| 1024 | + "outputs": [], |
| 1025 | + "source": [ |
| 1026 | + "!pip install duckdb" |
| 1027 | + ] |
| 1028 | + }, |
| 1029 | + { |
| 1030 | + "cell_type": "markdown", |
| 1031 | + "id": "484e2a53-aae2-4849-9182-80ca95ea6026", |
| 1032 | + "metadata": {}, |
| 1033 | + "source": [ |
| 1034 | + "When you run aggregations such as GROUP BY, SUM, or AVG on a few specific columns, row-based storage reads unnecessary data and uses memory inefficiently, because entire rows must be loaded even when only a few columns are needed.\n", |
| 1035 | + "\n", |
| 1036 | + "Example using SQLite (row-based):" |
| 1037 | + ] |
| 1038 | + }, |
| 1039 | + { |
| 1040 | + "cell_type": "code", |
| 1041 | + "execution_count": 7, |
| 1042 | + "id": "1171bdc1-0009-406a-be97-896eb0ed6ee5", |
| 1043 | + "metadata": {}, |
| 1044 | + "outputs": [], |
| 1045 | + "source": [ |
| 1046 | + "import sqlite3\n", |
| 1047 | + "import pandas as pd\n", |
| 1048 | + "\n", |
| 1049 | + "customer = pd.DataFrame({\n", |
| 1050 | + " \"id\": [1, 2, 3],\n", |
| 1051 | + " \"name\": [\"Alex\", \"Ben\", \"Chase\"],\n", |
| 1052 | + " \"age\": [25, 30, 35]\n", |
| 1053 | + "})\n", |
| 1054 | + "\n", |
| 1055 | + "# Load the DataFrame into an in-memory SQLite database, then query it\n", |
| 1056 | + "conn = sqlite3.connect(':memory:')\n", |
| 1057 | + "customer.to_sql('customer', conn, index=False)\n", |
| 1058 | + "\n", |
| 1059 | + "# Must read all columns internally even though we only need 'age'\n", |
| 1060 | + "query = \"SELECT age FROM customer\"\n", |
| 1061 | + "result = pd.read_sql(query, conn)" |
| 1062 | + ] |
| 1063 | + }, |
| 1064 | + { |
| 1065 | + "cell_type": "markdown", |
| 1066 | + "id": "56d499ac-7323-493f-859e-7921a052fdc7", |
| 1067 | + "metadata": {}, |
| 1068 | + "source": [ |
| 1069 | + "DuckDB uses columnar storage, allowing you to efficiently read and process only the columns needed for your analysis. This speeds up queries and reduces memory usage:" |
| 1070 | + ] |
| 1071 | + }, |
| 1072 | + { |
| 1073 | + "cell_type": "code", |
| 1074 | + "execution_count": 6, |
| 1075 | + "id": "ae65be7f-0e4b-485e-b403-4a2943a24578", |
| 1076 | + "metadata": {}, |
| 1077 | + "outputs": [ |
| 1078 | + { |
| 1079 | + "data": { |
| 1080 | + "text/html": [ |
| 1081 | + "<div>\n", |
| 1082 | + "<style scoped>\n", |
| 1083 | + " .dataframe tbody tr th:only-of-type {\n", |
| 1084 | + " vertical-align: middle;\n", |
| 1085 | + " }\n", |
| 1086 | + "\n", |
| 1087 | + " .dataframe tbody tr th {\n", |
| 1088 | + " vertical-align: top;\n", |
| 1089 | + " }\n", |
| 1090 | + "\n", |
| 1091 | + " .dataframe thead th {\n", |
| 1092 | + " text-align: right;\n", |
| 1093 | + " }\n", |
| 1094 | + "</style>\n", |
| 1095 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 1096 | + " <thead>\n", |
| 1097 | + " <tr style=\"text-align: right;\">\n", |
| 1098 | + " <th></th>\n", |
| 1099 | + " <th>age</th>\n", |
| 1100 | + " </tr>\n", |
| 1101 | + " </thead>\n", |
| 1102 | + " <tbody>\n", |
| 1103 | + " <tr>\n", |
| 1104 | + " <th>0</th>\n", |
| 1105 | + " <td>25</td>\n", |
| 1106 | + " </tr>\n", |
| 1107 | + " <tr>\n", |
| 1108 | + " <th>1</th>\n", |
| 1109 | + " <td>30</td>\n", |
| 1110 | + " </tr>\n", |
| 1111 | + " <tr>\n", |
| 1112 | + " <th>2</th>\n", |
| 1113 | + " <td>35</td>\n", |
| 1114 | + " </tr>\n", |
| 1115 | + " </tbody>\n", |
| 1116 | + "</table>\n", |
| 1117 | + "</div>" |
| 1118 | + ], |
| 1119 | + "text/plain": [ |
| 1120 | + " age\n", |
| 1121 | + "0 25\n", |
| 1122 | + "1 30\n", |
| 1123 | + "2 35" |
| 1124 | + ] |
| 1125 | + }, |
| 1126 | + "execution_count": 6, |
| 1127 | + "metadata": {}, |
| 1128 | + "output_type": "execute_result" |
| 1129 | + } |
| 1130 | + ], |
| 1131 | + "source": [ |
| 1132 | + "import duckdb\n", |
| 1133 | + "import pandas as pd\n", |
| 1134 | + "\n", |
| 1135 | + "customer = pd.DataFrame({\n", |
| 1136 | + " \"id\": [1, 2, 3],\n", |
| 1137 | + " \"name\": [\"Alex\", \"Ben\", \"Chase\"],\n", |
| 1138 | + " \"age\": [25, 30, 35]\n", |
| 1139 | + "})\n", |
| 1140 | + "\n", |
| 1141 | + "\n", |
| 1142 | + "query = \"SELECT age FROM customer\"\n", |
| 1143 | + "result = duckdb.sql(query).df()\n", |
| 1144 | + "result" |
| 1145 | + ] |
| 1146 | + }, |
| 1147 | + { |
| 1148 | + "cell_type": "markdown", |
| 1149 | + "id": "ada7a04d-783d-4599-842e-4160b3cdd58d", |
| 1150 | + "metadata": {}, |
| 1151 | + "source": [ |
| 1152 | + "In this example, DuckDB needs to access only the 'age' column in memory, while SQLite must read all columns ('id', 'name', 'age') internally even though only 'age' is selected. DuckDB also provides a simpler workflow: it queries the pandas DataFrame directly, with no separate load step." |
| 1153 | + ] |
| 1154 | + }, |
| 1155 | + { |
| 1156 | + "cell_type": "markdown", |
| 1157 | + "id": "1f895dd7-0a7c-471d-a6b4-94b8e5d18748", |
| 1158 | + "metadata": {}, |
| 1159 | + "source": [ |
| 1160 | + "[Link to DuckDB](https://github.com/duckdb/duckdb)." |
1001 | 1161 | ]
|
1002 | 1162 | },
|
1003 | 1163 | {
|
|
0 commit comments