|
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
| 9 | + "# Import necessary libraries\n", |
9 | 10 | "import numpy as np\n",
|
10 | 11 | "import pandas as pd\n",
|
11 | 12 | "import matplotlib.pyplot as plt\n",
|
|
22 | 23 | "metadata": {},
|
23 | 24 | "outputs": [],
|
24 | 25 | "source": [
|
25 |
# Load the movie dataset from its CSV file.
# NOTE(review): absolute Windows path — assumes this exact file exists locally.
dataset_path = "D:/Documents/Data Sets/movie_dataset.csv"
df = pd.read_csv(dataset_path)
26 | 28 | ]
|
27 | 29 | },
|
28 | 30 | {
|
|
31 | 33 | "metadata": {},
|
32 | 34 | "outputs": [],
|
33 | 35 | "source": [
|
# Preview the first few rows of the dataset (head() defaults to 5 rows)
df.head()
|
35 | 38 | ]
|
36 | 39 | },
|
|
40 | 43 | "metadata": {},
|
41 | 44 | "outputs": [],
|
42 | 45 | "source": [
|
# Show the dataset dimensions as a (rows, columns) tuple
dataset_shape = df.shape
dataset_shape
|
44 | 48 | ]
|
45 | 49 | },
|
|
49 | 53 | "metadata": {},
|
50 | 54 | "outputs": [],
|
51 | 55 | "source": [
|
# Count missing values per column (isna is the canonical alias of isnull)
df.isna().sum()
|
53 | 58 | ]
|
54 | 59 | },
|
|
58 | 63 | "metadata": {},
|
59 | 64 | "outputs": [],
|
60 | 65 | "source": [
|
# Drop every row that contains at least one missing value.
# Reassignment is preferred over inplace=True: inplace returns None,
# prevents method chaining, and the pandas team discourages its use.
df = df.dropna()
|
62 | 68 | ]
|
63 | 69 | },
|
|
67 | 73 | "metadata": {},
|
68 | 74 | "outputs": [],
|
69 | 75 | "source": [
|
# Sanity check: every per-column missing-value count should now be zero
df.isna().sum()
|
71 | 78 | ]
|
72 | 79 | },
|
|
76 | 83 | "metadata": {},
|
77 | 84 | "outputs": [],
|
78 | 85 | "source": [
|
# Summary of columns, non-null counts, and dtypes after dropping missing rows
df.info()
|
80 | 88 | ]
|
81 | 89 | },
|
|
85 | 93 | "metadata": {},
|
86 | 94 | "outputs": [],
|
87 | 95 | "source": [
|
88 |
# Pearson correlation between a film's budget and the revenue it earned
cor = df[['budget', 'revenue']].corr().loc['budget', 'revenue']
cor
|
90 | 99 | ]
|
91 | 100 | },
|
|
95 | 104 | "metadata": {},
|
96 | 105 | "outputs": [],
|
97 | 106 | "source": [
|
# Encode the categorical (string) columns as integer labels.
# NOTE: the encoder is named `le`, not `lr`, so it is no longer confusingly
# shadowed when `lr` is later rebound to the LinearRegression model.
le = preprocessing.LabelEncoder()

categorical_columns = [
    'title', 'original_title', 'original_language', 'status',
    'spoken_languages', 'production_countries', 'production_companies',
    'genres', 'overview', 'release_date',
]
for column in categorical_columns:
    # fit_transform refits the encoder on each column, so one instance
    # suffices — this matches the behavior of the original per-column calls
    df[column] = le.fit_transform(df[column])
110 | 119 | ]
|
111 | 120 | },
|
112 | 121 | {
|
|
115 | 124 | "metadata": {},
|
116 | 125 | "outputs": [],
|
117 | 126 | "source": [
|
# Confirm that all encoded columns are now numeric dtypes
df.info()
|
119 | 129 | ]
|
120 | 130 | },
|
|
124 | 134 | "metadata": {},
|
125 | 135 | "outputs": [],
|
126 | 136 | "source": [
|
127 |
# Visualise pairwise correlations between the numeric features as a heatmap
numeric_features = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

plt.figure(figsize=(14, 10))
ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
)
ax.set_title('Heatmap of Feature Correlations', fontsize=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
128 | 146 | ]
|
129 | 147 | },
|
130 | 148 | {
|
|
133 | 151 | "metadata": {},
|
134 | 152 | "outputs": [],
|
135 | 153 | "source": [
|
136 |
# Select the predictor columns and the regression target
feature_columns = ['budget', 'popularity', 'runtime']
X = df[feature_columns]
Y = df['revenue']
138 | 157 | ]
|
139 | 158 | },
|
140 | 159 | {
|
|
143 | 162 | "metadata": {},
|
144 | 163 | "outputs": [],
|
145 | 164 | "source": [
|
# Split the data into training (60%) and testing (40%) sets.
# random_state pins the shuffle so the split — and hence the reported
# error metric — is reproducible across notebook runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)
|
147 | 167 | ]
|
148 | 168 | },
|
|
152 | 172 | "metadata": {},
|
153 | 173 | "outputs": [],
|
154 | 174 | "source": [
|
# Instantiate an ordinary-least-squares linear regression model
lr = LinearRegression()
|
156 | 177 | ]
|
157 | 178 | },
|
|
161 | 182 | "metadata": {},
|
162 | 183 | "outputs": [],
|
163 | 184 | "source": [
|
# Train the regression model on the training portion of the data
lr.fit(X=x_train, y=y_train)
|
165 | 187 | ]
|
166 | 188 | },
|
|
170 | 192 | "metadata": {},
|
171 | 193 | "outputs": [],
|
172 | 194 | "source": [
|
# Predict revenue for the held-out test set
pred = lr.predict(X=x_test)
|
174 | 197 | ]
|
175 | 198 | },
|
|
179 | 202 | "metadata": {},
|
180 | 203 | "outputs": [],
|
181 | 204 | "source": [
|
# Report the mean absolute error of the test-set predictions
mae = metrics.mean_absolute_error(y_test, pred)
print(mae)
|
183 | 207 | ]
|
184 | 208 | }
|
|
0 commit comments