forked from pranav-suri/pbl2-ship-delay
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
118 lines (98 loc) · 4.8 KB
/
utils.py
File metadata and controls
118 lines (98 loc) · 4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# utils.py
import pandas as pd
import numpy as np
# --- Make these match your training configuration ---
FORECAST_HORIZON_HOURS = 48
TIME_WINDOWS = [6, 12, 24, 48]
WEATHER_VARS_TO_SUMMARIZE = [
"wind_speed_knots",
"visibility_nm",
"wave_height_m",
] # Add/remove based on training
CRITICAL_STATES = ["Fog", "HighWind"] # Add/remove based on training
# --- End of configuration ---
def calculate_weather_features_from_forecast(arrival_ts, hourly_forecast_list):
"""
Calculates weather summary features from a list of hourly forecast dictionaries.
Args:
arrival_ts (pd.Timestamp): The vessel's arrival timestamp.
hourly_forecast_list (list): A list of dictionaries, each representing one hour
of forecast data (e.g.,
{'timestamp': '2023-01-01T10:00:00Z', 'wind_speed_knots': 10, ...})
Returns:
pd.Series: A series containing the calculated weather summary features.
Returns an empty Series on error or insufficient data.
"""
if not hourly_forecast_list:
print("Warning: Received empty hourly_forecast_list.")
return pd.Series(dtype=np.float64)
try:
# Convert list of dicts to DataFrame
forecast_df = pd.DataFrame(hourly_forecast_list)
if "timestamp" not in forecast_df.columns:
raise ValueError("Forecast data must contain a 'timestamp' column.")
# Convert timestamp string/object to datetime and set as index
forecast_df["timestamp"] = pd.to_datetime(forecast_df["timestamp"])
forecast_df = forecast_df.set_index("timestamp").sort_index()
# Select the relevant forecast period (from arrival onwards)
forecast_start_time = arrival_ts
# Use the data frame index max if it's less than theoretical horizon end
horizon_end_time = forecast_start_time + pd.Timedelta(
hours=FORECAST_HORIZON_HOURS
)
forecast_end_time = min(horizon_end_time, forecast_df.index.max())
# Slice the data for the required forecast horizon starting from arrival_ts
# Allow for slightly flexible start time if exact arrival_ts isn't in index
forecast_data_period = forecast_df.loc[forecast_df.index >= forecast_start_time]
if forecast_data_period.empty:
print(
f"Warning: No forecast data found at or after arrival_ts {forecast_start_time}"
)
return pd.Series(dtype=np.float64)
# Further slice up to the calculated end time
forecast_data_period = forecast_data_period.loc[
forecast_data_period.index <= forecast_end_time
]
if forecast_data_period.empty:
print(
f"Warning: No forecast data found within the effective horizon {forecast_start_time} to {forecast_end_time}"
)
return pd.Series(dtype=np.float64)
# --- Feature Calculation (similar to training script) ---
features = {}
required_vars = set(WEATHER_VARS_TO_SUMMARIZE)
if "state" in forecast_df.columns:
required_vars.add("state")
if not required_vars.issubset(forecast_df.columns):
missing = required_vars - set(forecast_df.columns)
raise ValueError(f"Forecast data missing required columns: {missing}")
for window_h in TIME_WINDOWS:
window_end_time = forecast_start_time + pd.Timedelta(hours=window_h)
# Slice the already selected period for the current window
window_data = forecast_data_period.loc[
forecast_data_period.index <= window_end_time
]
if window_data.empty:
continue
for var in WEATHER_VARS_TO_SUMMARIZE:
series = window_data[var]
features[f"{var}_mean_{window_h}h"] = series.mean()
features[f"{var}_max_{window_h}h"] = series.max()
features[f"{var}_min_{window_h}h"] = series.min()
# Calculate std, handle potential NaN if only one data point
std_val = series.std()
features[f"{var}_std_{window_h}h"] = (
std_val if pd.notna(std_val) else 0.0
)
# Calculate state counts if 'state' column exists
if "state" in window_data.columns:
for state in CRITICAL_STATES:
features[f"{state}_hours_{window_h}h"] = (
window_data["state"] == state
).sum()
return pd.Series(features)
except Exception as e:
print(f"Error calculating weather features: {e}")
import traceback
traceback.print_exc() # Print detailed traceback for debugging
return pd.Series(dtype=np.float64) # Return empty series on error