voorbeeld resultaat goorloop EC

voorbeeld resultaat goorloop EC

Code
library(tidyverse)
library(leaflet)
library(sf)
library(plotly)
# library(anomalize)
library(slider)
library(anomaly)

# read custom functions
source('../scripts/test_get_tsfeatures.R')

Data inlezen

Code
locs <- read_csv2("../data/raw/locatie_mapping.csv")
sensor_lims <- read_csv2("../data/raw/meet_range_sensoren.csv") %>% 
  filter(path_length %in% c(1, NA))
par_ranges <- readxl::read_excel('../data/raw/validation_master_v1.xlsx', sheet = 'realistic_range')
df_gl <-
  arrow::read_parquet("../data/modified/sensor_data_tot.parquet", as_data_frame = FALSE) %>%
  filter(project == "Project Goorloop") %>%
  collect() %>%
  mutate(
    sensor = case_when(
      sensor == 'C4E' ~ 'Ponsel C4E',
      sensor == 'OPTOD' ~ 'Ponsel OPTOD',
      sensor == 'PHEHT' ~ 'Ponsel PHEHT',
      TRUE ~ sensor
    )
  )
# df_extra <- read_csv("../data/intermediate/data_voorbereidingworkshop_extra.csv") %>%
#    mutate(
#     sensor = case_when(
#       sensor == 'C4E' ~ 'Ponsel C4E',
#       sensor == 'PHEHT' ~ 'Ponsel PHEHT',
#       TRUE ~ sensor
#     ),
#     datetime = ymd_hms(datetime, tz = 'CET')
#   )
df <- filter(df_gl, parameter_combi == 'oGOORLO540 - 2518012504 WA1111-EU / EGV (Manta)(uS/cm)')  

s_lims <- semi_join(sensor_lims, df)
lims <- c(s_lims$ondergrens, s_lims$bovengrens)

# oh <- read_csv('../data/raw/data_oh_voor_workshop.csv') %>% 
#   mutate(datum = ymd_hms(datum, tz = 'CET'))
# 
# oh_saqc <- oh %>% select(datum_start = datum) %>% 
#   mutate(maint = datum_start + hours(3))
# 
# df_lab_cor <- read_csv('../data/raw/lab_cor_data_workshop.csv') %>% 
#   mutate(datetime_compare = ymd_hms(datetime_compare, tz = 'CET'))

criteria tabellen

Code
sensor_lims %>%
  knitr::kable(caption = glue::glue("Sensor ranges, dit geval: {s_lims$sensor} {s_lims$parameter_code} {s_lims$hoedanigheid_code}"))
Sensor ranges, dit geval: Eureka Manta +35 GELDHD NVT
sensor parameter_code hoedanigheid_code eenheid_code ondergrens bovengrens path_length
Ponsel PHEHT pH NVT DIMSLS 0 14 NA
Ponsel PHEHT T NVT oC -5 50 NA
Ponsel C4E T NVT oC -5 50 NA
Ponsel C4E GELDHD NVT uS/cm 0 2000 NA
Ponsel OPTOD O2 NVT mg/l 0 20 NA
Ponsel OPTOD O2 NVT % 0 200 NA
Ponsel OPTOD T NVT oC -5 50 NA
Eureka Manta +35 pH NVT DIMSLS 0 14 NA
Eureka Manta +35 GELDHD NVT uS/cm 0 5000 NA
Eureka Manta +35 O2 NVT mg/l 0 50 NA
Eureka Manta +35 O2 NVT % 0 500 NA
Eureka Manta +35 T NVT oC -5 50 NA
YSI EXO 3 pH NVT DIMSLS 0 14 NA
YSI EXO 3 GELDHD NVT mS/cm 0 100 NA
YSI EXO 3 GELDHD NVT uS/cm 0 100000 NA
YSI EXO 3 O2 NVT mg/l 0 50 NA
YSI EXO 3 O2 NVT % 0 500 NA
YSI EXO 3 T NVT oC -5 50 NA
Trios NICO NO3 nf mg/l 0 266 1
Trios NICO NO3 Nnf mg/l 0 60 1
OTT EcoN NO3 nf mg/l 0 266 1
OTT EcoN NO3 Nnf mg/l 0 60 1
YSI EXO 2 pH NVT DIMSLS 0 14 NA
YSI EXO 2 GELDHD NVT uS/cm 0 100000 NA
YSI EXO 2 O2 NVT mg/l 0 50 NA
YSI EXO 2 O2 NVT % 0 500 NA
YSI EXO 2 T NVT oC -5 50 NA

Code
par_ranges %>% select(parameter_id, jaar_min, jaar_max) %>% knitr::kable(caption = "Realistische ranges per parameter, dit geval Geleidendheid_25oC_uS/cm")
Realistische ranges per parameter, dit geval Geleidendheid_25oC_uS/cm
parameter_id jaar_min jaar_max
Ammonium_mg/l 0.00 1000.0
Ammonium_N_mg/l_AW 0.00 1000.0
Ammonium_N_mg/l_OW 0.00 1000.0
Chlorofyl-a_blauwalg_ug/l 0.00 5000.0
Chlorofyl-a_groenalg_ug/l 0.00 5000.0
Doorzicht_dm 0.00 617.0
Geleidendheid_25oC_uS/cm 100.00 1000.0
Geleidendheid_25oC_mS/m 10.00 100.0
Lichtintensiteit_1/m 0.00 28000.0
Lichtintensiteit_ref_A_212_nm_SNR 0.00 28000.0
Lichtintensiteit_ref_B_254_nm_SNR 0.00 28000.0
Lichtintensiteit_ref_C_360_nm_SNR 0.00 28000.0
Lichtintensiteit_ref_D_reference_diode_SNR 0.00 28000.0
Nitraat_mg/l 0.00 50.0
Nitraat_N_mg/l 0.00 50.0
pH 2.00 13.0
Ptot_P_mg/l 0.00 4.0
Redoxpotentiaal_mV -400.00 800.0
Relative_fluoresence_units_blauwalg 0.00 100.0
Relative_fluoresence_units_groenalg 0.00 100.0
SAC_254_nm 0.00 1500.0
Saliniteit_PSU 0.05 0.5
SQI 0.00 1.0
Temperatuur_oC_OW NA NA
Totaal_opgeloste_bestanddelen_mg/l 0.00 500.0
Troebelheid_NTU 0.00 10000.0
TRP_mg/l 0.00 1.0
Zuurstof_verzadiging_% 0.00 300.0
Zuurstof_opgelost_mg/l NA NA
Chlorofyl-a_blauwalg_ug/l 0.00 400.0
Chlorofyl-a_blauwalg_ug/l 0.00 400.0
Chlorofyl-a_groenalg_ug/l 0.00 400.0
Chlorofyl-a_groenalg_ug/l 0.00 400.0
Geleidendheid_25oC_mS/m 25.00 100.0
Geleidendheid_25oC_mS/m 25.00 250.0
Geleidendheid_25oC_mS/m 30.00 260.0
Geleidendheid_25oC_mS/m 43.00 54.0
Geleidendheid_25oC_mS/m 20.00 83.0
Temperatuur_oC_OW 1.00 28.0
Temperatuur_oC_OW 6.00 28.0
Temperatuur_oC_OW 9.00 30.0
Temperatuur_oC_OW 4.00 28.0
Temperatuur_oC_OW 4.00 28.0
pH 6.00 11.0
pH 6.00 11.0
pH 6.00 9.0
pH 7.00 11.0
pH 7.00 11.0
Zuurstof_verzadiging_% 0.00 180.0
Zuurstof_verzadiging_% 0.00 130.0
Zuurstof_verzadiging_% 0.00 100.0
Zuurstof_verzadiging_% 0.00 250.0
Zuurstof_verzadiging_% 0.00 250.0
Zuurstof_opgelost_mg/l NA NA
Zuurstof_opgelost_mg/l NA NA
Zuurstof_opgelost_mg/l NA NA
Zuurstof_opgelost_mg/l NA NA
Zuurstof_opgelost_mg/l NA NA

Visualiseer originele data serie

Code
p <- ggplot(df, aes(x = datetime, y = waarde)) +
  geom_point(size = 0.1, shape = 1) +
  geom_line() +
  labs(x = NULL, y = glue::glue("{s_lims$parameter_code} [{s_lims$eenheid_code}]"))
 
#ggplotly(p)
p

Check op onrealistische waarden

Code
lims_real <- c(100, 1000)
df <- df %>% mutate(waarde_realistisch = between(waarde, left = lims_real[1], right = lims_real[2]))

ggplot(df, aes(x = datetime, y = waarde)) +
  geom_line() +
      geom_point(aes(col = waarde_realistisch, shape = waarde_realistisch)) +
      labs(x = NULL, y = "NO3-N [mg/l]") +
      scale_color_manual(values = c("red", "black")) +
      scale_shape_manual(values = c(1, NA))

Check op flatlines

Code
df <- get_flatline(dataset = df, value_col = "waarde", threshold = 0, n_measurements = 10)


ggplot(filter(df, !is.na(flatline)), aes(x = datetime, y = waarde)) +
  geom_line() +
      geom_point(aes(col = flatline, shape = flatline)) +
      labs(x = NULL, y = "NO3-N [mg/l]") +
      scale_color_manual(values = c("black", "red")) +
      scale_shape_manual(values = c(NA, 1))

Check overige outliers

Code
df_trans <- filter(df, flatline == FALSE, waarde_realistisch == TRUE) %>% 
  distinct(datetime, waarde) %>% 
    group_by(datetime) %>%
    summarise(waarde = mean(waarde, na.rm = TRUE)) %>%
    ungroup() %>%
    transform_data_uv(time_col = 'datetime')

  df_trans$features <- slide(df_trans, ~get_tsfeatures(.x$waarde), .before = 9, .complete = FALSE)
  df_trans <- df_trans %>% unnest_wider(features)
  
col_keep <- is.na(df_trans) %>% colMeans()  
col_keep <- names(col_keep[col_keep < 1])
col_keep <- col_keep[!col_keep %in%  c('datetime', 'waarde', 'time')] 
df_testout <- df_trans[, col_keep]

out_forest <- isotree::isolation.forest(data = df_testout, ntrees = 500)

df_testout$datetime <- df_trans$datetime
df_testout$waarde <- df_trans$waarde
df_testout$iforest_score <- predict(out_forest, df_testout)
iforest_th <- quantile(df_testout$iforest_score, probs = 0.99, na.rm = TRUE, names = FALSE)
df_testout$iforest_type <- df_testout$iforest_score >= iforest_th

#plot
ggplot(df_testout, aes(x = datetime, y = waarde)) +
  geom_line() +
      geom_point(aes(col = iforest_type, shape = iforest_type)) +
      # labs(title = par) +
      scale_color_manual(values = c("black", "red")) +
      scale_shape_manual(values = c(NA, 1))