-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprepare.py
More file actions
251 lines (219 loc) · 9.5 KB
/
prepare.py
File metadata and controls
251 lines (219 loc) · 9.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
from app.utils import comet_utils
from app.utils.annotations import load_annotations, apply_annotations, ensure_human_labeled
from extract_coordinates import extract_flight_coordinates, generate_metadata
import pandas as pd
from pathlib import Path
import json
import os
import shutil
import sys
IMAGES_DIR = Path("app/data/images")
MANIFEST_PATH = IMAGES_DIR / "crop_manifest.json"
def normalize_predictions_scores(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize scores to the 0-1 scale and add a ``human_labeled`` column.

    - ``human_labeled`` is True when ``set`` is one of train/validation/review,
      else False. When there is no ``set`` column at all, every row gets False
      and the frame is returned otherwise unchanged.
    - When a ``score`` column is present, human-labeled rows and any row whose
      score exceeds 1 get ``score`` forced to 1.0.

    Returns a copy; the input frame is never mutated.
    """
    df = df.copy()
    if 'set' not in df.columns:
        df['human_labeled'] = False
        return df
    df['human_labeled'] = df['set'].isin(['train', 'validation', 'review'])
    # Bug fix: the original computed `df['score'] > 1` BEFORE checking that a
    # 'score' column exists, raising KeyError on score-less frames. Guard first.
    if 'score' in df.columns:
        # Human-labeled rows and any score > 1 are clamped to exactly 1.0.
        mask = df['human_labeled'] | (df['score'] > 1)
        df.loc[mask, 'score'] = 1.0
    return df
def build_predictions_indices(effective_df: pd.DataFrame) -> dict:
    """Build index dict: species_list, flight_list, by_species, by_flight."""
    frame = effective_df.copy()
    if 'crop_image_id' in frame.columns:
        frame['crop_image_id'] = frame['crop_image_id'].astype(str)

    def _ids_grouped_by(column: str) -> dict:
        # Map each group key to the list of its crop image ids (as strings).
        grouped = frame.groupby(column)['crop_image_id']
        return grouped.apply(lambda ids: ids.astype(str).tolist()).to_dict()

    has_flights = 'flight_name' in frame.columns
    species_list = sorted(frame['cropmodel_label'].dropna().unique().tolist())
    if has_flights:
        flight_list = sorted(frame['flight_name'].dropna().unique().tolist())
    else:
        flight_list = []
    return {
        'species_list': species_list,
        'flight_list': flight_list,
        'by_species': _ids_grouped_by('cropmodel_label'),
        'by_flight': _ids_grouped_by('flight_name') if has_flights else {},
    }
def flight_basename(flight_name: str) -> str:
    """Strip the first underscore-delimited prefix (e.g. 'JPG') to get the
    metadata basename that matches .aflight / .csv filenames."""
    # partition splits at the FIRST underscore only, mirroring the original
    # split/"_".join round trip; sep is empty when there is no underscore.
    _prefix, sep, remainder = flight_name.partition("_")
    return remainder if sep else flight_name
def load_manifest() -> dict:
    """Load the crop download manifest (previously downloaded experiments)."""
    # Missing manifest means nothing has been downloaded yet.
    if not MANIFEST_PATH.exists():
        return {}
    return json.loads(MANIFEST_PATH.read_text())
def save_manifest(experiments: list):
    """Save the crop download manifest."""
    # Make sure the images directory exists before writing alongside it.
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    payload = {"experiments": sorted(experiments)}
    MANIFEST_PATH.write_text(json.dumps(payload, indent=2))
def clear_crop_images():
    """Delete all crop images (but not the manifest) from the images directory."""
    if not IMAGES_DIR.exists():
        return
    for entry in IMAGES_DIR.iterdir():
        # Keep the manifest file alongside the images.
        if entry.name == MANIFEST_PATH.name:
            continue
        if entry.is_dir():
            shutil.rmtree(entry)
        elif entry.is_file():
            entry.unlink()
if __name__ == '__main__':
    # End-to-end data-preparation pipeline: pull the latest Comet results,
    # normalize and annotate the predictions, then refresh every derived
    # artifact under app/data/ (indices, shapefiles, crops, videos, reports).

    # --- 1. Download newest metrics and predictions from Comet ---
    print("\n[1/10] Downloading metrics and predictions from Comet...")
    comet_utils.get_all_comet_metrics()

    # --- 2. Load latest predictions (already Gulf-only from Comet) ---
    print("[2/10] Loading latest predictions...")
    latest_predictions = pd.read_csv(
        "app/data/most_recent_all_flight_predictions.csv")
    if latest_predictions.empty:
        # Nothing to process — warn and exit with a non-zero status.
        print("WARNING: No Gulf flights found. Check gulf_flights.txt and Comet prediction data.")
        print("\nDone.\n")
        sys.exit(1)

    # --- 3. Extract metadata only when predictions don't already have Lat/Lon ---
    # Two column conventions are accepted: 'Lat'/'Lon' or 'lat'/'long'.
    has_coords = (
        ('Lat' in latest_predictions.columns and 'Lon' in latest_predictions.columns)
        or ('lat' in latest_predictions.columns and 'long' in latest_predictions.columns)
    )
    if not has_coords:
        print("[3/10] Extracting flight coordinates (no Lat/Lon in predictions)...")
        for fn in latest_predictions['flight_name'].unique():
            # Metadata files are keyed by the flight name minus its first
            # underscore-delimited prefix (see flight_basename).
            fb = flight_basename(fn)
            if not os.path.exists(f"app/data/metadata/{fb}.csv"):
                extract_flight_coordinates(fb)
            else:
                print(f"Metadata already exists for {fn}")
        generate_metadata()
    else:
        print("[3/10] Predictions already have coordinates — skipping metadata extraction.")

    # --- 4. Normalize scores and add human_labeled ---
    # Rewrites the predictions CSV in place with normalized scores.
    print("[4/10] Normalizing scores and human_labeled...")
    latest_predictions = normalize_predictions_scores(latest_predictions)
    latest_predictions.to_csv(
        "app/data/most_recent_all_flight_predictions.csv", index=False)

    # --- 5. Apply annotations and build indices ---
    print("[5/10] Applying annotations and building indices...")
    annotations_df = load_annotations("app/data/annotations.csv")
    effective_predictions = apply_annotations(
        latest_predictions, annotations_df,
        id_col="crop_image_id", label_col="cropmodel_label", set_col="set"
    )
    effective_predictions = ensure_human_labeled(effective_predictions,
                                                 set_col="set")
    # Rows relabeled "FalsePositive" by reviewers are dropped from the
    # effective dataset entirely.
    fp_mask = effective_predictions["cropmodel_label"] == "FalsePositive"
    fp_count = fp_mask.sum()
    if fp_count:
        print(f"Dropping {fp_count} FalsePositive rows from effective "
              "predictions")
        effective_predictions = effective_predictions[~fp_mask]
    effective_path = Path(
        "app/data/most_recent_all_flight_predictions_effective.csv")
    effective_predictions.to_csv(effective_path, index=False)
    indices = build_predictions_indices(effective_predictions)
    indices_path = Path("app/data/predictions_indices.json")
    with open(indices_path, "w") as f:
        json.dump(indices, f, indent=2)
    print(f" Wrote {indices_path} and {effective_path}")

    # --- 6. Normalize predictions.csv (full history) ---
    # Best-effort: only normalizes the history file when it exists.
    print("[6/10] Normalizing full predictions history...")
    predictions_path = Path("app/data/predictions.csv")
    if predictions_path.exists():
        predictions_df = pd.read_csv(predictions_path)
        predictions_df = normalize_predictions_scores(predictions_df)
        predictions_df.to_csv(predictions_path, index=False)

    # --- 7. Create shapefiles ---
    print("[7/10] Creating shapefiles...")
    # When predictions already carry coordinates no metadata file is passed;
    # otherwise point at the metadata.csv generated in step 3.
    comet_utils.create_shapefiles(
        latest_predictions,
        metadata=None if has_coords else "app/data/metadata.csv",
    )

    # --- 8. Smart crop download ---
    # Crops are re-downloaded only when the set of experiments changed since
    # the last run (tracked via the manifest). FalsePositive-annotated images
    # are skipped on download and purged from disk either way.
    print("[8/10] Checking crop download...")
    current_experiments = sorted(
        latest_predictions['experiment'].unique().tolist())
    manifest = load_manifest()
    old_experiments = manifest.get("experiments", [])
    fp_image_ids = set(
        annotations_df.loc[
            annotations_df["new_label"] == "FalsePositive", "image_id"
        ].astype(str)
    )
    print(f"Will skip {len(fp_image_ids)} FalsePositive image(s) "
          "during download")
    if current_experiments == old_experiments:
        # Same experiment set as last run: keep existing crops, but still
        # remove any image that has since been marked FalsePositive.
        print("Crop manifest unchanged — skipping download")
        removed = 0
        for fp_id in fp_image_ids:
            fp_path = IMAGES_DIR / fp_id
            if fp_path.exists():
                fp_path.unlink()
                removed += 1
        if removed:
            print(f"Removed {removed} existing FalsePositive image(s) "
                  "from disk")
    else:
        print(f"Experiment set changed "
              f"({len(old_experiments)} → {len(current_experiments)}). "
              "Re-downloading crops...")
        clear_crop_images()
        for experiment_name in current_experiments:
            comet_utils.download_images(save_dir="app/data/images",
                                        experiment_name=experiment_name)
            # Downloads appear to arrive in a per-experiment subfolder;
            # flatten its contents into IMAGES_DIR, dropping FalsePositive
            # crops on the way. TODO confirm download_images' layout.
            exp_dir = IMAGES_DIR / experiment_name
            if exp_dir.exists():
                for image in os.listdir(str(exp_dir)):
                    if image in fp_image_ids:
                        os.remove(str(exp_dir / image))
                        continue
                    shutil.move(str(exp_dir / image),
                                str(IMAGES_DIR / image))
                shutil.rmtree(str(exp_dir))
        # Safety net: purge any FalsePositive image that reached IMAGES_DIR.
        removed = 0
        for fp_id in fp_image_ids:
            fp_path = IMAGES_DIR / fp_id
            if fp_path.exists():
                fp_path.unlink()
                removed += 1
        if removed:
            print(f"Removed {removed} existing FalsePositive image(s) "
                  "from disk")
        save_manifest(current_experiments)
        print(f" Downloaded crops for {len(current_experiments)} experiment(s)")

    # --- 9. Flythrough videos (one per flight from its latest experiment) ---
    print("[9/10] Downloading flythrough videos...")
    # drop_duplicates keeps the first experiment row seen per flight_name;
    # presumably the CSV is ordered so that is the latest — verify upstream.
    flight_to_experiment = (
        latest_predictions.dropna(subset=["flight_name", "experiment"])
        .drop_duplicates("flight_name")
        .set_index("flight_name")["experiment"]
        .to_dict()
    )
    comet_utils.download_flythrough_videos(flight_to_experiment, save_dir="app/data/videos")

    # --- 10. Flight reports (transect map, report PDF, shapefiles, etc.) ---
    print("[10/10] Downloading flight reports...")
    comet_utils.download_flight_reports(flight_to_experiment, save_dir="app/data/reports")
    print("\nDone.\n")