-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnormalize_datasets.py
165 lines (136 loc) · 5.76 KB
/
normalize_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import pandas as pd
import os
from datetime import datetime
def normalize_date(date_str):
"""Convert various date formats to YYYY-MM-DD."""
try:
return pd.to_datetime(date_str).strftime('%Y-%m-%d')
except:
return None
def process_hardware_companies():
    """Process files from hardware-companies.zip.

    Scans ``datasets/`` for CSV files whose names contain a known company
    name, tags each with its stock symbol, normalizes the Close column
    (preferring ``Close`` over ``Adj Close``), and concatenates everything.

    Returns:
        A combined DataFrame, or an empty DataFrame if nothing matched.
    """
    dfs = []
    # Map of filename patterns to stock symbols
    symbol_map = {
        'AMD': 'AMD',
        'INTEL': 'INTC',
        'NVIDIA': 'NVDA',
        'ASUS': 'ASUS',
        'MSI': 'MSI'
    }
    for file in os.listdir('datasets'):
        if not file.endswith('.csv'):
            continue
        for company in symbol_map:
            if company in file.upper():
                try:
                    df = pd.read_csv(f'datasets/{file}')
                    print(f"Columns in {file}: {df.columns.tolist()}")
                    df['stockSymbol'] = symbol_map[company]
                    # Prefer the raw Close; fall back to Adj Close only
                    # when Close itself is absent.
                    if 'Close' in df.columns:
                        df = df.drop('Adj Close', axis=1, errors='ignore')
                    elif 'Adj Close' in df.columns:
                        df['Close'] = df['Adj Close']
                        df = df.drop('Adj Close', axis=1)
                    # Remove any remaining duplicate columns
                    df = df.loc[:, ~df.columns.duplicated()]
                    dfs.append(df)
                except Exception as e:
                    print(f"Error processing {file}: {e}")
                # Bug fix: stop after the first matching pattern so a file
                # whose name contains two company names (e.g. "AMD_MSI.csv")
                # is not read and appended twice.
                break
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    return pd.DataFrame()
def process_nasdaq_companies():
    """Process files from nasdaq-companies.zip"""
    # Translation table from the NASDAQ dump's column names to the
    # common schema used by the other datasets.
    column_aliases = {
        'ticker_symbol': 'stockSymbol',
        'day_date': 'Date',
        'close_value': 'Close',
        'volume': 'Volume',
        'open_value': 'Open',
        'high_value': 'High',
        'low_value': 'Low',
    }
    try:
        companies = pd.read_csv('datasets/Company.csv')
        values = pd.read_csv('datasets/CompanyValues.csv')
        print(f"NASDAQ values columns: {values.columns.tolist()}")
        # Attach company metadata to every price row, then rename to
        # the shared column names.
        merged = values.merge(companies, on='ticker_symbol', how='left')
        return merged.rename(columns=column_aliases)
    except Exception as e:
        print(f"Error processing NASDAQ data: {e}")
        return pd.DataFrame()
def process_top_10_companies():
    """Process data.csv from top-10-companies.zip"""
    # Map company names to stock symbols
    company_to_symbol = {
        'APPLE': 'AAPL',
        'MICROSOFT': 'MSFT',
        'AMAZON': 'AMZN',
        'CISCO': 'CSCO',
        'MICRON': 'MU',
    }
    try:
        df = pd.read_csv('datasets/data.csv')
        print(f"Top 10 columns: {df.columns.tolist()}")
        # Uppercase the names once so the lookup matches regardless of
        # how the source file capitalizes them.
        upper_names = df['Company'].str.upper()
        df['Company'] = upper_names
        df['stockSymbol'] = upper_names.map(company_to_symbol)
        # Rename Close/Last to Close
        return df.rename(columns={'Close/Last': 'Close'})
    except Exception as e:
        print(f"Error processing top 10 companies data: {e}")
        return pd.DataFrame()
def main():
    """Normalize all three source datasets into one CSV.

    Loads the hardware, NASDAQ, and top-10 datasets, keeps only the shared
    columns, coerces dates and numerics into uniform types, drops rows with
    missing values, and writes ``datasets/normalized_financial_data.csv``.
    """
    # Process all datasets
    hardware_df = process_hardware_companies()
    nasdaq_df = process_nasdaq_companies()
    top10_df = process_top_10_companies()
    # List of required columns in final output
    required_columns = ['Date', 'stockSymbol', 'Open', 'High', 'Low', 'Close', 'Volume']
    # Normalize each dataframe
    dfs = []
    for df, name in [(hardware_df, 'hardware'), (nasdaq_df, 'nasdaq'), (top10_df, 'top10')]:
        if not df.empty:
            print(f"\nProcessing {name} dataset")
            print(f"Available columns: {df.columns.tolist()}")
            # Check if all required columns exist
            missing_cols = set(required_columns) - set(df.columns)
            if missing_cols:
                print(f"Warning: {name} dataset is missing columns: {missing_cols}")
                continue
            try:
                # Select only the required columns and remove duplicates
                df = df[required_columns].copy()
                df = df.drop_duplicates()
                dfs.append(df)
            except Exception as e:
                print(f"Error processing {name} dataset: {e}")
    # Combine all dataframes
    if dfs:
        try:
            final_df = pd.concat(dfs, ignore_index=True)
            # Normalize data types
            final_df['Date'] = final_df['Date'].apply(normalize_date)
            # Clean up numeric columns
            numeric_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
            for col in numeric_columns:
                # Strip currency symbols and thousands separators.
                # regex=False is required: under pandas < 1.4 the default
                # was regex=True, where '$' is the end-of-string anchor and
                # the dollar sign would never be removed, silently coercing
                # currency-formatted values to NaN below.
                if final_df[col].dtype == 'object':
                    final_df[col] = (
                        final_df[col]
                        .astype(str)
                        .str.replace('$', '', regex=False)
                        .str.replace(',', '', regex=False)
                    )
                final_df[col] = pd.to_numeric(final_df[col], errors='coerce')
            # Remove rows with missing values
            final_df = final_df.dropna()
            # Sort by date and stock symbol (dates are YYYY-MM-DD strings,
            # so lexicographic order equals chronological order)
            final_df = final_df.sort_values(['stockSymbol', 'Date'])
            # Save to CSV
            final_df.to_csv('datasets/normalized_financial_data.csv', index=False)
            print(f"\nSuccessfully normalized {len(final_df)} rows of data")
        except Exception as e:
            print(f"\nError in final processing: {e}")
    else:
        print("\nNo data was processed successfully")
if __name__ == "__main__":
    main()