Skip to content

As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

  • date - recorded date of measurement - (int)
  • cloud_cover - cloud cover measurement in oktas - (float)
  • sunshine - sunshine measurement in hours (hrs) - (float)
  • global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
  • max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
  • mean_temp - mean temperature in degrees Celsius (°C) - (float)
  • min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
  • precipitation - precipitation measurement in millimeters (mm) - (float)
  • pressure - pressure measurement in Pascals (Pa) - (float)
  • snow_depth - snow depth measurement in centimeters (cm) - (float)
# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Load the London weather observations from the CSV file.
weather = pd.read_csv("london_weather.csv")

# First look at the raw frame: column summary plus a preview of the rows.
weather.info()
weather.head()

# Is any cell in the frame missing?
weather.isna().to_numpy().any()

# The `date` column is cast to str and parsed with the %Y%m%d format so it
# becomes a proper datetime column.
weather["date"] = pd.to_datetime(weather["date"].astype(str), format="%Y%m%d")
weather.info()
weather.head()

# Derive "year" and "month" from the parsed date so the features can be
# plotted against time.
weather = weather.assign(
    year=weather["date"].dt.year,
    month=weather["date"].dt.month,
)
weather.head()
# Visualize "mean_temp" versus "month" and versus "year" using sns.lineplot().
# Observation from the yearly plot: the temperature tends to increase gradually.
sns.lineplot(x="month", y="mean_temp", data=weather)
plt.show()

sns.lineplot(x="year", y="mean_temp", data=weather)
plt.show()

# Correlation heatmap over the numeric columns only. The parsed datetime
# `date` column is excluded explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns by default.
numeric_weather = weather.select_dtypes(include="number")
sns.heatmap(numeric_weather.corr(), annot=True, cmap="coolwarm")
plt.show()
# Predictors picked after inspecting the correlation heatmap; "mean_temp" is
# the regression target.
feature_selection = ["sunshine", "min_temp", "max_temp", "global_radiation"]

# Keep only the rows whose target value is present; the same row mask is
# applied to both the feature matrix and the target.
mask = weather["mean_temp"].notna()
X = weather.loc[mask, feature_selection]
y = weather.loc[mask, "mean_temp"]
print(X.shape)
# Registry of candidate regressors, keyed by model name. Every entry holds:
#   "numeric_pipeline" - preprocessing applied to the numeric feature columns
#   "model"            - the (unfitted) estimator instance
# NOTE: the experiment loop further down overrides `max_depth` for the two
# tree-based models, so the values given here are only starting points.
regression_models = {
    "LinearRegression": {
        # Mean-impute missing values, then standardize the features.
        "numeric_pipeline": Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="mean")),
                ("scaler", StandardScaler()),
            ]
        ),
        "model": LinearRegression(),
    },
    "DecisionTreeRegressor": {
        # Mean imputation only; no scaling step is applied for this model.
        "numeric_pipeline": Pipeline(
            steps=[("imputer", SimpleImputer(strategy="mean"))]
        ),
        "model": DecisionTreeRegressor(
            max_depth=21,
            min_samples_leaf=13,
            random_state=34,
        ),
    },
    "RandomForestRegressor": {
        # Mean imputation only; no scaling step is applied for this model.
        "numeric_pipeline": Pipeline(
            steps=[("imputer", SimpleImputer(strategy="mean"))]
        ),
        "model": RandomForestRegressor(
            n_estimators=233,     # number of trees in the forest
            max_depth=15,         # depth cap per tree
            min_samples_leaf=13,  # regularization: minimum samples per leaf
            n_jobs=-1,            # use all CPU cores
            random_state=34,
        ),
    },
}
def set_max_depth(model, max_depth=None):
    """Set the ``max_depth`` hyperparameter on an estimator and return it.

    Args:
        model: Estimator object exposing a ``set_params(**params)`` method.
        max_depth: Value to assign to the ``max_depth`` parameter; ``None``
            is passed through unchanged.

    Returns:
        The same ``model`` instance (it is modified in place, not copied).

    Raises:
        Whatever ``model.set_params`` raises when it does not accept a
        ``max_depth`` parameter.
    """
    # Pass the value through set_params(...) so the estimator can validate
    # the parameter name, instead of assigning an arbitrary attribute on
    # whatever object an empty set_params() call happens to return.
    model.set_params(max_depth=max_depth)
    return model
# For predicting weather we use a time-aware split (first 80% train, last 20%
# test) instead of a random shuffle.
# NOTE(review): this assumes the rows are in chronological order - confirm.
# The split point is derived from X (rows without a target were already
# dropped), not from the raw `weather` frame, so the test set is a true 20%
# of the modelling data. The split is identical for every model, so it is
# computed once, outside the loop.
n = len(X)
split_idx = int(n * 0.80)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]


def run_model(regression_type, full_model_pipeline, max_depth=None):
    """Fit, evaluate and log one pipeline as a single MLflow run.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        regression_type: Model name; used in the printed header and as the
            artifact path under which the fitted pipeline is logged.
        full_model_pipeline: Pipeline (preprocessor + regressor) to fit.
        max_depth: Depth used for this run; logged as the ``max_depth``
            parameter (``None`` for LinearRegression).
    """
    # Start the MLflow run; everything logged below is attached to it.
    with mlflow.start_run():
        # Fit the model
        full_model_pipeline.fit(X_train, y_train)

        # Evaluate on the held-out tail of the data
        y_pred = full_model_pipeline.predict(X_test)
        if regression_type != "LinearRegression":
            print(f"Result for {regression_type} model with max_depth={max_depth}:")
        else:
            print(f"Result for {regression_type} model:")

        # RMSE is computed as sqrt(MSE) explicitly: the `squared=False`
        # shortcut of mean_squared_error is deprecated and removed in newer
        # scikit-learn releases.
        rmse = round(float(np.sqrt(mean_squared_error(y_test, y_pred))), 4)
        mae = round(mean_absolute_error(y_test, y_pred), 4)
        r2 = round(r2_score(y_test, y_pred), 4)

        mlflow.sklearn.log_model(full_model_pipeline, regression_type)
        mlflow.log_param("max_depth", max_depth)
        # "rmse_tree" is the metric name the experiment query sorts by.
        mlflow.log_metric("rmse_tree", rmse)
        # Also persist the other computed metrics instead of only printing them.
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        print("RMSE: ", rmse)
        print("MAE : ", mae)
        print("R²  : ", r2)
        print()


for regression_type, model_spec in regression_models.items():
    preprocessor = ColumnTransformer(
        transformers=[("num", model_spec["numeric_pipeline"], feature_selection)]
    )
    reg_model = model_spec["model"]

    # Tree-based models are swept over max_depth 1..21; LinearRegression is
    # run exactly once with max_depth=None.
    if regression_type != "LinearRegression":
        depths = range(1, 22)
    else:
        depths = [None]

    for depth in depths:
        if depth is not None:
            reg_model = set_max_depth(reg_model, max_depth=depth)
        # Full model pipeline
        full_model_pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("regressor", reg_model)
        ])
        run_model(regression_type, full_model_pipeline, max_depth=depth)
# Query the Experiment
from mlflow import MlflowClient

experiment = mlflow.get_experiment_by_name("Default")
if experiment is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError('MLflow experiment "Default" was not found')
print("Experiment ID:", experiment.experiment_id)

client = MlflowClient()

# Search all runs in the Default experiment
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="",  # leave empty for all
    order_by=["metrics.rmse_tree"],  # sort by metric
    max_results=100,
)

for run in runs:
    print(
        f"Run ID: {run.info.run_id}, "
        f"max_depth={run.data.params.get('max_depth')}, "
        f"rmse={run.data.metrics.get('rmse_tree')}"
    )

# Same runs as a DataFrame. The experiment id looked up above is reused
# rather than a hard-coded "0", so both queries target the same experiment.
experiment_results = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
print(experiment_results.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
experiment_results.info()