Prefect 2.0 Deployment

bansalkanav · bansalkanav · commit 231c982483f9 · 2022-09-19T20:32:27.000+05:30
diff --git a/Module 5 - MLOPs/4. Orchestrate ML Pipeline/README.md b/Module 5 - MLOPs/4. Orchestrate ML Pipeline/README.md
@@ -4,7 +4,7 @@
 
 > `version_1` - Breaking the Jupyter Notebook to Python Script (Basic Code without workflow management)  
 > `version_2` - Code with Prefect Workflow - Defining the workflow and running them  
-> `version_3` - Dealing with the variables and monitoring the workflow with Prefect Cloud
+> `version_3` - Deployment and Scheduling tasks
 
 
 ### Why Prefect?
@@ -72,4 +72,30 @@ Try 'uvicorn --help' for help.
 
 Error: Got unexpected extra argument (prefect.orion.api.server:create_app)
 Orion stopped!
+```
+
+### Deployment of Prefect Flow
+
+- `work_queue_name` is used to submit the deployment to the a specific work queue.
+- You don't need to create a work queue before using the work queue. A work queue will be created if it doesn't exist.
+
+```python
+from prefect.deployments import Deployment
+from prefect.orion.schemas.schedules import IntervalSchedule
+from datetime import timedelta
+
+deployment = Deployment.build_from_flow(
+    flow=main,
+    name="model_training",
+    schedule=IntervalSchedule(interval=timedelta(minutes=5)),
+    work_queue_name="ml"
+)
+
+deployment.apply()
+```
+
+### Running an Agent
+
+```
+$ prefect agent start --work-queue "ml"
 ```
diff --git a/Module 5 - MLOPs/4. Orchestrate ML Pipeline/version_3/workflow_v3.py b/Module 5 - MLOPs/4. Orchestrate ML Pipeline/version_3/workflow_v3.py
@@ -2,79 +2,119 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
-from prefect import task, Flow, Parameter
-
-
-######################################################################
-# Now that we understand how to add task and execute the flow.
-# Lets have a look at how to deal with variables.
-# If you find yourself frequently experimenting with different values of
-# one variable, it’s ideal to turn that variable into a Parameter.
-#
-# Monitor your Workflow
-#
-# Login to prefect cloud
-# prefect auth login --key <YOUR-KEY>
-#
-# Run
-# prefect create project "Iris Project"
-# prefect agent local start
-# flow.register(project_name="Iris Project")
-######################################################################
-
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.model_selection import GridSearchCV
+import mlflow
+from prefect import task, flow
 
 @task
-def load_data(path: str) -> pd.DataFrame:
+def load_data(path: str, unwanted_cols: List) -> pd.DataFrame:
     data = pd.read_csv(path)
-    return data
-
-@task
-def remove_unwanted_cols(data: pd.DataFrame, unwanted_cols: List) -> pd.DataFrame:
     data.drop(unwanted_cols, axis=1, inplace=True)
     return data
 
+
 @task
-def get_classes(data: pd.DataFrame, target_col: str) -> List[str]:
-    return list(data[target_col].unique())
+def get_classes(target_data: pd.Series) -> List[str]:
+    return list(target_data.unique())
+
 
 @task
-def rescale_numerical_columns(data: pd.DataFrame, target_col: str) -> pd.DataFrame:
-    
-    X = data.drop([target_col], axis=1)
+def get_scaler(data: pd.DataFrame) -> Any:
     # scaling the numerical features
     scaler = StandardScaler()
+    scaler.fit(data)
+    return scaler
 
+
+@task
+def rescale_data(data: pd.DataFrame, scaler: Any) -> pd.DataFrame:    
+    # scaling the numerical features
     # column names are (annoyingly) lost after Scaling
     # (i.e. the dataframe is converted to a numpy ndarray)
-    data_rescaled = pd.DataFrame(scaler.fit_transform(X), 
-                                        columns = X.columns, 
-                                        index = X.index)
-
+    data_rescaled = pd.DataFrame(scaler.transform(data), 
+                                columns = data.columns, 
+                                index = data.index)
     return data_rescaled
 
+
 @task
-def split_data(input: pd.DataFrame, output: pd.Series, test_data_ratio: float) -> Dict[str, Any]:
-   
-    X_tr, X_te, y_tr, y_te = train_test_split(input, output, test_size=test_data_ratio, random_state=0)
-    
+def split_data(input_: pd.DataFrame, output_: pd.Series, test_data_ratio: float) -> Dict[str, Any]:
+    X_tr, X_te, y_tr, y_te = train_test_split(input_, output_, test_size=test_data_ratio, random_state=0)
     return {'X_TRAIN': X_tr, 'Y_TRAIN': y_tr, 'X_TEST': X_te, 'Y_TEST': y_te}
 
-# Defining the complete Flow
-with Flow('data-engineer') as flow:
+
+@task
+def find_best_model(X_train: pd.DataFrame, y_train: pd.Series, estimator: Any, parameters: List) -> Any:
+    # Enabling automatic MLflow logging for scikit-learn runs
+    mlflow.sklearn.autolog(max_tuning_runs=None)
+
+    with mlflow.start_run():        
+        clf = GridSearchCV(
+            estimator=estimator, 
+            param_grid=parameters, 
+            scoring='accuracy',
+            cv=5,
+            return_train_score=True,
+            verbose=1
+        )
+        clf.fit(X_train, y_train)
+        
+        # Disabling autologging
+        mlflow.sklearn.autolog(disable=True)
+        
+        return clf
+
+
+# Workflow
+@flow
+def main(path: str='./data/iris.csv', target: str='Species', unwanted_cols: List[str]=['Id'], test_size: float=0.2):
+    
+    mlflow.set_tracking_uri("sqlite:///mlflow.db")
+    mlflow.set_experiment("Iris Species Prediction")
+    
     # Define Parameters
-    target_col = 'Species'
-    unwanted_cols = ['Id']
-    test_data_ratio = Parameter("test_data_ratio", default=0.2)
-
-    # Run Functions
-    data = load_data(path='data/iris.csv')
-
-    # Workflow
-    remove_unwanted_cols(data=data, unwanted_cols=unwanted_cols)
-    numerical_data = rescale_numerical_columns(data=data, target_col=target_col)
-    classes = get_classes(data=data, target_col=target_col)
-    train_test_dict = split_data(input=numerical_data, output=data[target_col], test_data_ratio=test_data_ratio)
+    DATA_PATH = path
+    TARGET_COL = target
+    UNWANTED_COLS = unwanted_cols
+    TEST_DATA_RATIO = test_size
+
+    # Load the Data
+    dataframe = load_data(path=DATA_PATH, unwanted_cols=UNWANTED_COLS)
+
+    # Identify Target Variable
+    target_data = dataframe[TARGET_COL]
+    input_data = dataframe.drop([TARGET_COL], axis=1)
+
+    # Get Unique Classes
+    classes = get_classes(target_data=target_data)
+    
+    # Split the Data into Train and Test
+    train_test_dict = split_data(input_=input_data, output_=target_data, test_data_ratio=TEST_DATA_RATIO)
+    
+    # Rescaling Train and Test Data
+    scaler = get_scaler(train_test_dict['X_TRAIN'])
+    train_test_dict['X_TRAIN'] = rescale_data(data=train_test_dict['X_TRAIN'], scaler=scaler)
+    train_test_dict['X_TEST'] = rescale_data(data=train_test_dict['X_TEST'], scaler=scaler)
     
-# flow.run(parameters={'test_data_ratio': 0.3})
+    # Model Training
+    ESTIMATOR = KNeighborsClassifier()
+    HYPERPARAMETERS = [{'n_neighbors':[i for i in range(1, 51)], 'p':[1, 2]}]
+    classifier = find_best_model(train_test_dict['X_TRAIN'], train_test_dict['Y_TRAIN'], ESTIMATOR, HYPERPARAMETERS)
+    print(classifier.best_params_)
+    print(classifier.score(train_test_dict['X_TEST'], train_test_dict['Y_TEST']))
+    
+    
+# Deploy the main function
+from prefect.deployments import Deployment
+from prefect.orion.schemas.schedules import IntervalSchedule
+from datetime import timedelta
+
+deployment = Deployment.build_from_flow(
+    flow=main,
+    name="model_training",
+    schedule=IntervalSchedule(interval=timedelta(minutes=5)),
+    work_queue_name="ml"
+)
 
-flow.register(project_name="demo")
+deployment.apply()