import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from wip.logging_config import logger
def mod_filtros(df_train, qualidade, col_target):
    """Return the rows of ``df_train`` that pass the filter for ``qualidade``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Data to be filtered.
    qualidade : str
        Label that selects which rule is applied.
    col_target : str
        Column compared against the threshold. It is not read when
        ``qualidade`` starts with ``"cm1"``.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` restricted to the rows that satisfy the selected rule.

    Notes
    -----
    Rules, in order of precedence:

    * ``qualidade`` starting with ``"cm1"``:
      ``corpo_moedor_especifico_1 > 0``
    * ``"torque"``: ``col_target >= 3000``
    * ``"energia_prensa"``: ``col_target >= 0.15``
    * ``"finos"``: ``col_target <= 15``
    * ``"compressao"``: ``col_target > 200``
    * anything else: ``col_target >= 0``
    """
    # The "cm1" family is filtered on a fixed column, not on the target.
    if qualidade.startswith("cm1"):
        return df_train[df_train["corpo_moedor_especifico_1"] > 0]

    # One predicate per label that has a dedicated threshold.
    row_rules = {
        "torque": lambda values: values >= 3000,
        "energia_prensa": lambda values: values >= 0.15,
        "finos": lambda values: values <= 15,
        "compressao": lambda values: values > 200,
    }
    keep_rows = row_rules.get(qualidade, lambda values: values >= 0)
    return df_train[keep_rows(df_train[col_target])]
# Quality labels accepted verbatim by ``manual_kfold_validation``.
_VALID_QUALIDADES = frozenset(
    {
        "GRAN PR",
        "SE PR",
        "abrasao",
        "basicidade",
        "cfix",
        "custo_GRAN PR",
        "custo_SE PR",
        "custo_abrasao",
        "custo_distribuicao gran",
        "distribuicao gran",
        "energia_filtro",
        "energia_forno",
        "energia_moinho",
        "energia_moinho1",
        "energia_moinho2",
        "energia_moinho3",
        "energia_prensa",
        "finos",
        "gas",
        "produtividade filtragem",
        "relacao gran",
        "temp_forno",
        "temp_precipitador",
        "temp_precipitador_1",
        "temp_precipitador_2",
        "temp_precipitador_3",
        "temp_precipitador_4",
        "temp_recirc",
        "torque",
        "umidade",
        "SE PP",
        "SUP_SE_PP",
        "compressao",
        "custo_SE PP",
        "custo_compressao",
        "particulados1",
        "particulados2",
        "particulados3",
        "taxarp",
    }
)

# Prefixes that identify whole families of accepted quality labels.
_VALID_QUALIDADE_PREFIXES = ("rota_disco_", "cm", "dens_moinho")


def _adjusted_r2(r2, n_samples, n_features):
    """Return the adjusted R², or NaN when ``n_samples - n_features - 1`` is 0."""
    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom == 0:
        # The statistic is undefined here; avoid a ZeroDivisionError / inf.
        return np.float64(np.nan)
    return 1 - (1 - r2) * (n_samples - 1) / degrees_of_freedom


def manual_kfold_validation(model, df_train, df_target, **kwargs):
    """Validate ``model`` on consecutive index windows and average the metrics.

    For each pair of consecutive values ``(lower, upper)`` in
    ``cv_thresholds``, the model is fitted on every row whose index is
    ``<= lower`` and evaluated on the rows whose index is in
    ``(lower, upper]``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        on every fold.
    df_train : pandas.DataFrame
        Feature matrix. Its index is compared against ``cv_thresholds``.
    df_target : pandas.Series
        Target values, split with the same thresholds using its own index.
        NOTE(review): assumed to be a Series aligned with ``df_train``;
        confirm against callers.
    **kwargs
        Must contain ``kwparams``, a mapping with the keys
        ``"cv_thresholds"`` (ordered fold boundaries) and ``"qualidade"``
        (the quality label being modelled).

    Returns
    -------
    tuple
        ``(indexes, ys, yhats, metrics)`` where ``indexes``, ``ys`` and
        ``yhats`` are lists with the test index labels, true values and
        predictions of all folds, and ``metrics`` is the tuple
        ``(mse, mape, r2, r, r2_train, r2_train_adj)`` averaged over the
        folds. Every metric is NaN when ``cv_thresholds`` yields no fold.

    Raises
    ------
    ValueError
        If ``kwparams`` is missing, or if ``qualidade`` is not an accepted
        label. The label is checked before any fold is processed.
    KeyError
        If ``kwparams`` lacks ``"cv_thresholds"`` or ``"qualidade"``.
    """
    params = kwargs.get("kwparams")
    if params is None:
        err_msg = "Parameters `cv_thresholds` and `qualidade` must be specified."
        logger.error(err_msg)
        raise ValueError(err_msg)

    cv_thresholds = params["cv_thresholds"]
    qualidade = params["qualidade"]

    # The label does not depend on the fold, so validate it once up front.
    # This also rejects a bad label when there are no folds to iterate over.
    if qualidade not in _VALID_QUALIDADES and not qualidade.startswith(
        _VALID_QUALIDADE_PREFIXES
    ):
        raise ValueError(f'A qualidade "{qualidade}" não é uma opção válida')

    ys = []
    yhats = []
    metrics = []
    indexes = []

    for lower, upper in zip(cv_thresholds, cv_thresholds[1:]):
        # Training window: everything up to and including ``lower``.
        train = df_train[df_train.index <= lower]
        trainy = df_target[df_target.index <= lower]
        # Test window: the half-open interval (lower, upper].
        test = df_train[(df_train.index > lower) & (df_train.index <= upper)]
        testy = df_target[(df_target.index > lower) & (df_target.index <= upper)]

        model.fit(train, trainy)
        p_train = model.predict(train)
        predicted = model.predict(test)

        mse = mean_squared_error(testy, predicted)
        mape = np.mean(np.abs(testy - predicted) / np.abs(testy)) * 100
        r2 = r2_score(testy, predicted)
        r2_train = r2_score(trainy, p_train)
        # Pearson correlation between predictions and observed values.
        r = pd.DataFrame(predicted)[0].corr(pd.DataFrame(testy.values)[0])
        n_samples, n_features = train.shape
        r2_train_adj = _adjusted_r2(r2_train, n_samples, n_features)

        metrics.append((mse, mape, r2, r, r2_train, r2_train_adj))
        indexes.extend(testy.index)
        ys.extend(testy)
        yhats.extend(predicted)

    # -- Metrics ---------------------------------------------------------------
    if not metrics:
        # No fold was evaluated: report NaN for every metric explicitly
        # instead of averaging empty lists.
        nan = np.float64(np.nan)
        return indexes, ys, yhats, (nan, nan, nan, nan, nan, nan)

    mse, mape, r2, r, r2_train, r2_train_adj = (
        np.mean(fold_values) for fold_values in zip(*metrics)
    )
    return indexes, ys, yhats, (mse, mape, r2, r, r2_train, r2_train_adj)