voorbeeld resultaat Zuid Willemsvaart Chlorophyll
voorbeeld resultaat Zuid-Willemsvaart Chlorophyll
Code
library(tidyverse)
library(leaflet)
library(sf)
library(plotly)
# library(anomalize)
library(slider)
library(anomaly)
# read custom functions
source('../scripts/test_get_tsfeatures.R')
Data inlezen
Code
<- read_csv2("../data/raw/locatie_mapping.csv")
locs <- read_csv2("../data/raw/meet_range_sensoren.csv") %>%
sensor_lims filter(path_length %in% c(1, NA))
<- readxl::read_excel('../data/raw/validation_master_v1.xlsx', sheet = 'realistic_range')
par_ranges <-
df_zw ::read_parquet("../data/modified/sensor_data_tot.parquet", as_data_frame = FALSE) %>%
arrowfilter(project == "Project Zuid-Willemsvaart") %>%
collect() %>%
mutate(
sensor = case_when(
== 'C4E' ~ 'Ponsel C4E',
sensor == 'OPTOD' ~ 'Ponsel OPTOD',
sensor == 'PHEHT' ~ 'Ponsel PHEHT',
sensor TRUE ~ sensor
)
)# df_extra <- read_csv("../data/intermediate/data_voorbereidingworkshop_extra.csv") %>%
# mutate(
# sensor = case_when(
# sensor == 'C4E' ~ 'Ponsel C4E',
# sensor == 'PHEHT' ~ 'Ponsel PHEHT',
# TRUE ~ sensor
# ),
# datetime = ymd_hms(datetime, tz = 'CET')
# )
<- filter(df_zw, parameter_combi == 'Chlorophyll ug/l')
df
<- semi_join(sensor_lims, df)
s_lims <- c(0.0001, 400)
lims
# oh <- read_csv('../data/raw/data_oh_voor_workshop.csv') %>%
# mutate(datum = ymd_hms(datum, tz = 'CET'))
#
# oh_saqc <- oh %>% select(datum_start = datum) %>%
# mutate(maint = datum_start + hours(3))
#
# df_lab_cor <- read_csv('../data/raw/lab_cor_data_workshop.csv') %>%
# mutate(datetime_compare = ymd_hms(datetime_compare, tz = 'CET'))
criteria tabellen
Code
%>%
sensor_lims ::kable(caption = glue::glue("Sensor ranges, dit geval: niet aanwezig")) knitr
sensor | parameter_code | hoedanigheid_code | eenheid_code | ondergrens | bovengrens | path_length |
---|---|---|---|---|---|---|
Ponsel PHEHT | pH | NVT | DIMSLS | 0 | 14 | NA |
Ponsel PHEHT | T | NVT | oC | -5 | 50 | NA |
Ponsel C4E | T | NVT | oC | -5 | 50 | NA |
Ponsel C4E | GELDHD | NVT | uS/cm | 0 | 2000 | NA |
Ponsel OPTOD | O2 | NVT | mg/l | 0 | 20 | NA |
Ponsel OPTOD | O2 | NVT | % | 0 | 200 | NA |
Ponsel OPTOD | T | NVT | oC | -5 | 50 | NA |
Eureka Manta +35 | pH | NVT | DIMSLS | 0 | 14 | NA |
Eureka Manta +35 | GELDHD | NVT | uS/cm | 0 | 5000 | NA |
Eureka Manta +35 | O2 | NVT | mg/l | 0 | 50 | NA |
Eureka Manta +35 | O2 | NVT | % | 0 | 500 | NA |
Eureka Manta +35 | T | NVT | oC | -5 | 50 | NA |
YSI EXO 3 | pH | NVT | DIMSLS | 0 | 14 | NA |
YSI EXO 3 | GELDHD | NVT | mS/cm | 0 | 100 | NA |
YSI EXO 3 | GELDHD | NVT | uS/cm | 0 | 100000 | NA |
YSI EXO 3 | O2 | NVT | mg/l | 0 | 50 | NA |
YSI EXO 3 | O2 | NVT | % | 0 | 500 | NA |
YSI EXO 3 | T | NVT | oC | -5 | 50 | NA |
Trios NICO | NO3 | nf | mg/l | 0 | 266 | 1 |
Trios NICO | NO3 | Nnf | mg/l | 0 | 60 | 1 |
OTT EcoN | NO3 | nf | mg/l | 0 | 266 | 1 |
OTT EcoN | NO3 | Nnf | mg/l | 0 | 60 | 1 |
YSI EXO 2 | pH | NVT | DIMSLS | 0 | 14 | NA |
YSI EXO 2 | GELDHD | NVT | uS/cm | 0 | 100000 | NA |
YSI EXO 2 | O2 | NVT | mg/l | 0 | 50 | NA |
YSI EXO 2 | O2 | NVT | % | 0 | 500 | NA |
YSI EXO 2 | T | NVT | oC | -5 | 50 | NA |
Code
%>% select(parameter_id, jaar_min, jaar_max) %>% knitr::kable(caption = "Realistische ranges per parameter, dit geval Chlorofyl-a_p_ug/l") par_ranges
parameter_id | jaar_min | jaar_max |
---|---|---|
Ammonium_mg/l | 0.00 | 1000.0 |
Ammonium_N_mg/l_AW | 0.00 | 1000.0 |
Ammonium_N_mg/l_OW | 0.00 | 1000.0 |
Chlorofyl-a_blauwalg_ug/l | 0.00 | 5000.0 |
Chlorofyl-a_groenalg_ug/l | 0.00 | 5000.0 |
Doorzicht_dm | 0.00 | 617.0 |
Geleidendheid_25oC_uS/cm | 100.00 | 1000.0 |
Geleidendheid_25oC_mS/m | 10.00 | 100.0 |
Lichtintensiteit_1/m | 0.00 | 28000.0 |
Lichtintensiteit_ref_A_212_nm_SNR | 0.00 | 28000.0 |
Lichtintensiteit_ref_B_254_nm_SNR | 0.00 | 28000.0 |
Lichtintensiteit_ref_C_360_nm_SNR | 0.00 | 28000.0 |
Lichtintensiteit_ref_D_reference_diode_SNR | 0.00 | 28000.0 |
Nitraat_mg/l | 0.00 | 50.0 |
Nitraat_N_mg/l | 0.00 | 50.0 |
pH | 2.00 | 13.0 |
Ptot_P_mg/l | 0.00 | 4.0 |
Redoxpotentiaal_mV | -400.00 | 800.0 |
Relative_fluoresence_units_blauwalg | 0.00 | 100.0 |
Relative_fluoresence_units_groenalg | 0.00 | 100.0 |
SAC_254_nm | 0.00 | 1500.0 |
Saliniteit_PSU | 0.05 | 0.5 |
SQI | 0.00 | 1.0 |
Temperatuur_oC_OW | NA | NA |
Totaal_opgeloste_bestanddelen_mg/l | 0.00 | 500.0 |
Troebelheid_NTU | 0.00 | 10000.0 |
TRP_mg/l | 0.00 | 1.0 |
Zuurstof_verzadiging_% | 0.00 | 300.0 |
Zuurstof_opgelost_mg/l | NA | NA |
Chlorofyl-a_blauwalg_ug/l | 0.00 | 400.0 |
Chlorofyl-a_blauwalg_ug/l | 0.00 | 400.0 |
Chlorofyl-a_groenalg_ug/l | 0.00 | 400.0 |
Chlorofyl-a_groenalg_ug/l | 0.00 | 400.0 |
Geleidendheid_25oC_mS/m | 25.00 | 100.0 |
Geleidendheid_25oC_mS/m | 25.00 | 250.0 |
Geleidendheid_25oC_mS/m | 30.00 | 260.0 |
Geleidendheid_25oC_mS/m | 43.00 | 54.0 |
Geleidendheid_25oC_mS/m | 20.00 | 83.0 |
Temperatuur_oC_OW | 1.00 | 28.0 |
Temperatuur_oC_OW | 6.00 | 28.0 |
Temperatuur_oC_OW | 9.00 | 30.0 |
Temperatuur_oC_OW | 4.00 | 28.0 |
Temperatuur_oC_OW | 4.00 | 28.0 |
pH | 6.00 | 11.0 |
pH | 6.00 | 11.0 |
pH | 6.00 | 9.0 |
pH | 7.00 | 11.0 |
pH | 7.00 | 11.0 |
Zuurstof_verzadiging_% | 0.00 | 180.0 |
Zuurstof_verzadiging_% | 0.00 | 130.0 |
Zuurstof_verzadiging_% | 0.00 | 100.0 |
Zuurstof_verzadiging_% | 0.00 | 250.0 |
Zuurstof_verzadiging_% | 0.00 | 250.0 |
Zuurstof_opgelost_mg/l | NA | NA |
Zuurstof_opgelost_mg/l | NA | NA |
Zuurstof_opgelost_mg/l | NA | NA |
Zuurstof_opgelost_mg/l | NA | NA |
Zuurstof_opgelost_mg/l | NA | NA |
Visualiseer originele data serie
Code
<- ggplot(df, aes(x = datetime, y = waarde)) +
p geom_point(size = 0.1, shape = 1) +
geom_line() +
labs(x = NULL, y = glue::glue("{s_lims$parameter_code} [{s_lims$eenheid_code}]"))
#ggplotly(p)
p
Check op onrealistische waarden
Code
<- c(0.00001, 400)
lims_real <- df %>% mutate(waarde_realistisch = between(waarde, left = lims_real[1], right = lims_real[2]))
df
ggplot(df, aes(x = datetime, y = waarde)) +
geom_line() +
geom_point(aes(col = waarde_realistisch, shape = waarde_realistisch)) +
labs(x = NULL, y = "NO3-N [mg/l]") +
scale_color_manual(values = c("red", "black")) +
scale_shape_manual(values = c(1, NA))
Check op flatlines
Code
<- get_flatline(dataset = df, value_col = "waarde", threshold = 0, n_measurements = 10)
df
ggplot(filter(df, !is.na(flatline)), aes(x = datetime, y = waarde)) +
geom_line() +
geom_point(aes(col = flatline, shape = flatline)) +
labs(x = NULL, y = "NO3-N [mg/l]") +
scale_color_manual(values = c("black", "red")) +
scale_shape_manual(values = c(NA, 1))
Check overige outliers
Code
<- filter(df, flatline == FALSE, waarde_realistisch == TRUE) %>%
df_trans distinct(datetime, waarde) %>%
group_by(datetime) %>%
summarise(waarde = mean(waarde, na.rm = TRUE)) %>%
ungroup() %>%
transform_data_uv(time_col = 'datetime')
$features <- slide(df_trans, ~get_tsfeatures(.x$waarde), .before = 9, .complete = FALSE)
df_trans<- df_trans %>% unnest_wider(features)
df_trans
<- is.na(df_trans) %>% colMeans()
col_keep <- names(col_keep[col_keep < 1])
col_keep <- col_keep[!col_keep %in% c('datetime', 'waarde', 'time')]
col_keep <- df_trans[, col_keep]
df_testout
<- isotree::isolation.forest(data = df_testout, ntrees = 500)
out_forest
$datetime <- df_trans$datetime
df_testout$waarde <- df_trans$waarde
df_testout$iforest_score <- predict(out_forest, df_testout)
df_testout<- quantile(df_testout$iforest_score, probs = 0.99, na.rm = TRUE, names = FALSE)
iforest_th $iforest_type <- df_testout$iforest_score >= iforest_th
df_testout
#plot
ggplot(df_testout, aes(x = datetime, y = waarde)) +
geom_line() +
geom_point(aes(col = iforest_type, shape = iforest_type)) +
# labs(title = par) +
scale_color_manual(values = c("black", "red")) +
scale_shape_manual(values = c(NA, 1))