voorbeeld resultaat Dinter O2
Code
library(tidyverse)
library(leaflet)
library(sf)
library(plotly)
# library(anomalize)
library(slider)
library(anomaly)
# read custom functions
source('../scripts/test_get_tsfeatures.R')
Data inlezen
Code
locs <- read_csv2("../data/raw/locatie_mapping.csv")
sensor_lims <- read_csv2("../data/raw/meet_range_sensoren.csv") %>%
filter(path_length %in% c(1, NA))
par_ranges <- readxl::read_excel('../data/raw/validation_master_v1.xlsx', sheet = 'realistic_range')
df_dt <-
arrow::read_parquet("../data/modified/sensor_data_tot.parquet", as_data_frame = FALSE) %>%
filter(project == "Project RWZI Dinther") %>%
collect() %>%
mutate(
sensor = case_when(
sensor == 'C4E' ~ 'Ponsel C4E',
sensor == 'OPTOD' ~ 'Ponsel OPTOD',
sensor == 'PHEHT' ~ 'Ponsel PHEHT',
TRUE ~ sensor
)
)
# df_extra <- read_csv("../data/intermediate/data_voorbereidingworkshop_extra.csv") %>%
# mutate(
# sensor = case_when(
# sensor == 'C4E' ~ 'Ponsel C4E',
# sensor == 'PHEHT' ~ 'Ponsel PHEHT',
# TRUE ~ sensor
# ),
# datetime = ymd_hms(datetime, tz = 'CET')
# )
df <- filter(df_dt, parameter_combi == 'aEFFLDI003 - 2620015709 WA1111-EU-V2 / Dissolved Oxygen (OPTOD)(mg/L)')
s_lims <- semi_join(sensor_lims, df)
lims <- c(s_lims$ondergrens, s_lims$bovengrens)
# oh <- read_csv('../data/raw/data_oh_voor_workshop.csv') %>%
# mutate(datum = ymd_hms(datum, tz = 'CET'))
#
# oh_saqc <- oh %>% select(datum_start = datum) %>%
# mutate(maint = datum_start + hours(3))
#
# df_lab_cor <- read_csv('../data/raw/lab_cor_data_workshop.csv') %>%
# mutate(datetime_compare = ymd_hms(datetime_compare, tz = 'CET'))
criteria tabellen
Code
sensor_lims %>%
knitr::kable(caption = glue::glue("Sensor ranges, dit geval: {s_lims$sensor} {s_lims$parameter_code} {s_lims$hoedanigheid_code}"))
Sensor ranges, dit geval: Ponsel OPTOD O2 NVT
Ponsel PHEHT |
pH |
NVT |
DIMSLS |
0 |
14 |
NA |
Ponsel PHEHT |
T |
NVT |
oC |
-5 |
50 |
NA |
Ponsel C4E |
T |
NVT |
oC |
-5 |
50 |
NA |
Ponsel C4E |
GELDHD |
NVT |
uS/cm |
0 |
2000 |
NA |
Ponsel OPTOD |
O2 |
NVT |
mg/l |
0 |
20 |
NA |
Ponsel OPTOD |
O2 |
NVT |
% |
0 |
200 |
NA |
Ponsel OPTOD |
T |
NVT |
oC |
-5 |
50 |
NA |
Eureka Manta +35 |
pH |
NVT |
DIMSLS |
0 |
14 |
NA |
Eureka Manta +35 |
GELDHD |
NVT |
uS/cm |
0 |
5000 |
NA |
Eureka Manta +35 |
O2 |
NVT |
mg/l |
0 |
50 |
NA |
Eureka Manta +35 |
O2 |
NVT |
% |
0 |
500 |
NA |
Eureka Manta +35 |
T |
NVT |
oC |
-5 |
50 |
NA |
YSI EXO 3 |
pH |
NVT |
DIMSLS |
0 |
14 |
NA |
YSI EXO 3 |
GELDHD |
NVT |
mS/cm |
0 |
100 |
NA |
YSI EXO 3 |
GELDHD |
NVT |
uS/cm |
0 |
100000 |
NA |
YSI EXO 3 |
O2 |
NVT |
mg/l |
0 |
50 |
NA |
YSI EXO 3 |
O2 |
NVT |
% |
0 |
500 |
NA |
YSI EXO 3 |
T |
NVT |
oC |
-5 |
50 |
NA |
Trios NICO |
NO3 |
nf |
mg/l |
0 |
266 |
1 |
Trios NICO |
NO3 |
Nnf |
mg/l |
0 |
60 |
1 |
OTT EcoN |
NO3 |
nf |
mg/l |
0 |
266 |
1 |
OTT EcoN |
NO3 |
Nnf |
mg/l |
0 |
60 |
1 |
YSI EXO 2 |
pH |
NVT |
DIMSLS |
0 |
14 |
NA |
YSI EXO 2 |
GELDHD |
NVT |
uS/cm |
0 |
100000 |
NA |
YSI EXO 2 |
O2 |
NVT |
mg/l |
0 |
50 |
NA |
YSI EXO 2 |
O2 |
NVT |
% |
0 |
500 |
NA |
YSI EXO 2 |
T |
NVT |
oC |
-5 |
50 |
NA |
Code
par_ranges %>% select(parameter_id, jaar_min, jaar_max) %>% knitr::kable(caption = "Realistische ranges per parameter, dit geval Zuurstof_opgelost_mg/l")
Realistische ranges per parameter, dit geval Zuurstof_opgelost_mg/l
Ammonium_mg/l |
0.00 |
1000.0 |
Ammonium_N_mg/l_AW |
0.00 |
1000.0 |
Ammonium_N_mg/l_OW |
0.00 |
1000.0 |
Chlorofyl-a_blauwalg_ug/l |
0.00 |
5000.0 |
Chlorofyl-a_groenalg_ug/l |
0.00 |
5000.0 |
Doorzicht_dm |
0.00 |
617.0 |
Geleidendheid_25oC_uS/cm |
100.00 |
1000.0 |
Geleidendheid_25oC_mS/m |
10.00 |
100.0 |
Lichtintensiteit_1/m |
0.00 |
28000.0 |
Lichtintensiteit_ref_A_212_nm_SNR |
0.00 |
28000.0 |
Lichtintensiteit_ref_B_254_nm_SNR |
0.00 |
28000.0 |
Lichtintensiteit_ref_C_360_nm_SNR |
0.00 |
28000.0 |
Lichtintensiteit_ref_D_reference_diode_SNR |
0.00 |
28000.0 |
Nitraat_mg/l |
0.00 |
50.0 |
Nitraat_N_mg/l |
0.00 |
50.0 |
pH |
2.00 |
13.0 |
Ptot_P_mg/l |
0.00 |
4.0 |
Redoxpotentiaal_mV |
-400.00 |
800.0 |
Relative_fluoresence_units_blauwalg |
0.00 |
100.0 |
Relative_fluoresence_units_groenalg |
0.00 |
100.0 |
SAC_254_nm |
0.00 |
1500.0 |
Saliniteit_PSU |
0.05 |
0.5 |
SQI |
0.00 |
1.0 |
Temperatuur_oC_OW |
NA |
NA |
Totaal_opgeloste_bestanddelen_mg/l |
0.00 |
500.0 |
Troebelheid_NTU |
0.00 |
10000.0 |
TRP_mg/l |
0.00 |
1.0 |
Zuurstof_verzadiging_% |
0.00 |
300.0 |
Zuurstof_opgelost_mg/l |
NA |
NA |
Chlorofyl-a_blauwalg_ug/l |
0.00 |
400.0 |
Chlorofyl-a_blauwalg_ug/l |
0.00 |
400.0 |
Chlorofyl-a_groenalg_ug/l |
0.00 |
400.0 |
Chlorofyl-a_groenalg_ug/l |
0.00 |
400.0 |
Geleidendheid_25oC_mS/m |
25.00 |
100.0 |
Geleidendheid_25oC_mS/m |
25.00 |
250.0 |
Geleidendheid_25oC_mS/m |
30.00 |
260.0 |
Geleidendheid_25oC_mS/m |
43.00 |
54.0 |
Geleidendheid_25oC_mS/m |
20.00 |
83.0 |
Temperatuur_oC_OW |
1.00 |
28.0 |
Temperatuur_oC_OW |
6.00 |
28.0 |
Temperatuur_oC_OW |
9.00 |
30.0 |
Temperatuur_oC_OW |
4.00 |
28.0 |
Temperatuur_oC_OW |
4.00 |
28.0 |
pH |
6.00 |
11.0 |
pH |
6.00 |
11.0 |
pH |
6.00 |
9.0 |
pH |
7.00 |
11.0 |
pH |
7.00 |
11.0 |
Zuurstof_verzadiging_% |
0.00 |
180.0 |
Zuurstof_verzadiging_% |
0.00 |
130.0 |
Zuurstof_verzadiging_% |
0.00 |
100.0 |
Zuurstof_verzadiging_% |
0.00 |
250.0 |
Zuurstof_verzadiging_% |
0.00 |
250.0 |
Zuurstof_opgelost_mg/l |
NA |
NA |
Zuurstof_opgelost_mg/l |
NA |
NA |
Zuurstof_opgelost_mg/l |
NA |
NA |
Zuurstof_opgelost_mg/l |
NA |
NA |
Zuurstof_opgelost_mg/l |
NA |
NA |
Visualiseer originele data serie
Code
p <- ggplot(df, aes(x = datetime, y = waarde)) +
geom_point(size = 0.1, shape = 1) +
geom_line() +
labs(x = NULL, y = glue::glue("{s_lims$parameter_code} [{s_lims$eenheid_code}]"))
#ggplotly(p)
p
Check op onrealistische waarden
Code
lims_real <- c(0, 12)
df <- df %>% mutate(waarde_realistisch = between(waarde, left = lims_real[1], right = lims_real[2]))
ggplot(df, aes(x = datetime, y = waarde)) +
geom_line() +
geom_point(aes(col = waarde_realistisch, shape = waarde_realistisch)) +
labs(x = NULL, y = "NO3-N [mg/l]") +
scale_color_manual(values = c("red", "black")) +
scale_shape_manual(values = c(1, NA))
Check op flatlines
Code
df <- get_flatline(dataset = df, value_col = "waarde", threshold = 0, n_measurements = 10)
ggplot(filter(df, !is.na(flatline)), aes(x = datetime, y = waarde)) +
geom_line() +
geom_point(aes(col = flatline, shape = flatline)) +
labs(x = NULL, y = "NO3-N [mg/l]") +
scale_color_manual(values = c("black", "red")) +
scale_shape_manual(values = c(NA, 1))
Check overige outliers
Code
df_trans <- filter(df, flatline == FALSE, waarde_realistisch == TRUE) %>%
distinct(datetime, waarde) %>%
group_by(datetime) %>%
summarise(waarde = mean(waarde, na.rm = TRUE)) %>%
ungroup() %>%
transform_data_uv(time_col = 'datetime')
df_trans$features <- slide(df_trans, ~get_tsfeatures(.x$waarde), .before = 9, .complete = FALSE)
df_trans <- df_trans %>% unnest_wider(features)
col_keep <- is.na(df_trans) %>% colMeans()
col_keep <- names(col_keep[col_keep < 1])
col_keep <- col_keep[!col_keep %in% c('datetime', 'waarde', 'time')]
df_testout <- df_trans[, col_keep]
out_forest <- isotree::isolation.forest(data = df_testout, ntrees = 500)
df_testout$datetime <- df_trans$datetime
df_testout$waarde <- df_trans$waarde
df_testout$iforest_score <- predict(out_forest, df_testout)
iforest_th <- quantile(df_testout$iforest_score, probs = 0.99, na.rm = TRUE, names = FALSE)
df_testout$iforest_type <- df_testout$iforest_score >= iforest_th
#plot
ggplot(df_testout, aes(x = datetime, y = waarde)) +
geom_line() +
geom_point(aes(col = iforest_type, shape = iforest_type)) +
# labs(title = par) +
scale_color_manual(values = c("black", "red")) +
scale_shape_manual(values = c(NA, 1))