Modelling the rainfall occurrence in France

Let us use the SB distribution to model rain occurrence across France. The data used is extracted from the ECAD database.

It consists in daily rainfall occurrence at D=37 locations in France.

using SpatialBernoulli, Optimization,OptimizationOptimJL, Optim
using CSV,DataFrames
using Random
import ForwardDiff
Random.seed!(1234)

Random.TaskLocalRNG()

Getting the data

station_50Q = CSV.read("data/transformedECAD_stations.csv",DataFrame);
Yobs=Matrix(CSV.read("data/transformedECAD_Yobs.csv",header=false,DataFrame));
my_distance =Matrix(CSV.read("data/transformedECAD_locsdistances.csv",header=false,DataFrame));

37×37 Matrix{Float64}:
   0.0    389.896   307.699  482.271   344.316  …  317.574   422.329  184.346
 389.896    0.0     173.655  155.985   725.256     654.644   543.313  571.883
 307.699  173.655     0.0    327.626   605.781     616.454   369.69   468.686
 482.271  155.985   327.626    0.0     826.539     692.529   696.731  665.99
 344.316  725.256   605.781  826.539     0.0       345.157   486.874  162.749
 394.101  783.497   678.832  871.133   101.807  …  302.446   587.462  212.348
 293.731  627.068   483.636  749.883   169.486     440.745   317.394  181.57
 234.259  546.886   401.97   672.817   230.258     439.144   272.019  177.973
 300.506  456.444   286.283  604.072   417.315     586.314   122.598  344.704
 207.579  498.035   476.966  532.98    398.37      159.817   626.323  257.754
   ⋮                                            ⋱              ⋮      
 461.379   71.9253  221.477  146.955   794.846     724.88    585.041  642.74
 406.969  208.141   345.24   128.604   747.013     577.301   695.839  584.3
 765.765  633.85    780.438  495.006  1058.97   …  773.231  1121.29   906.242
 309.531  348.062   174.423  501.18    514.518     622.706   195.552  412.007
 476.645  688.338   516.76   836.328   410.028     692.304   171.331  429.33
 434.226  724.252   563.543  860.259   273.911     588.111   286.226  333.402
 317.574  654.644   616.454  692.529   345.157       0.0     698.055  263.061
 422.329  543.313   369.69   696.731   486.874  …  698.055     0.0    445.693
 184.346  571.883   468.686  665.99    162.749     263.061   445.693    0.0

First, we split the data in months. Explicit seasonality in the sense of periodic parameterisation is not treated here, see here for using this type of model in a periodic setting, or here for other parameterisations.

my_locations = hcat(station_50Q.LON_idx, station_50Q.LAT_idx)
nlocs = length(my_locations[:, 1])

select_month = function (m::Int64, dates, Y::AbstractMatrix)
    indicesm = findall(month.(dates) .== m)
    return Y[:, indicesm]
end

using Dates
date_start = Date(1973)


date_end = Date(2024) - Day(1)

every_year = date_start:Day(1):date_end
dates = every_year


Ymonths = [select_month(m, every_year, Yobs) for m in 1:12];

12-element Vector{Matrix{Bool}}:
 [0 0 … 1 0; 0 0 … 0 0; … ; 0 0 … 1 1; 0 0 … 0 0]
 [0 0 … 0 0; 0 0 … 0 0; … ; 1 0 … 0 1; 0 0 … 0 0]
 [1 1 … 1 1; 0 1 … 0 0; … ; 0 0 … 1 1; 1 0 … 1 1]
 [0 1 … 0 0; 0 1 … 1 1; … ; 0 1 … 1 0; 1 1 … 0 0]
 [1 1 … 0 0; 1 0 … 0 1; … ; 1 0 … 0 0; 1 1 … 0 0]
 [1 0 … 0 1; 1 0 … 1 1; … ; 0 1 … 0 1; 1 1 … 0 1]
 [0 0 … 0 1; 0 0 … 0 0; … ; 0 0 … 0 1; 0 0 … 0 1]
 [1 0 … 0 1; 1 0 … 0 0; … ; 0 1 … 1 1; 1 1 … 0 1]
 [0 0 … 0 1; 0 0 … 0 0; … ; 0 0 … 0 0; 0 0 … 0 1]
 [0 0 … 1 1; 1 1 … 1 0; … ; 0 0 … 1 1; 0 0 … 1 1]
 [0 0 … 1 1; 0 0 … 1 1; … ; 0 0 … 1 1; 0 0 … 1 1]
 [0 0 … 1 1; 1 0 … 0 1; … ; 1 0 … 1 1; 0 0 … 1 1]

Inference for each month

We can now estimate model parameters for each month, using the fit_mle_vfast method. The results are saved to the data folder. This is easily multi-thread-able.

using JLD2
using LineSearches

for imonth in 1:12
    y = Ymonths[imonth]
    n = length(y[1, :])


    init_range = 600
    init_order = 0.5
    init_lambda = fill(0.4, nlocs)
    init_d = SB(init_range, 1.0, init_order, init_lambda, my_distance)

    #pairwise solution
    tdist = maximum(my_distance) / 3
    wp = 1.0 .* (my_distance .< tdist)

    @time sol_fixednu = fit_mle_vfast(init_d, y, wp;solver = Optim.LBFGS(
    linesearch = LineSearches.BackTracking()
)
,  order=1/2,maxiters = 100, return_sol=false)

    # Save to JLD2 file
    save("data/vfastfitted_month_QMC100" * string(imonth) * ".jld2", Dict("d" => sol_fixednu))

end

  7.739438 seconds (18.82 M allocations: 911.995 MiB, 5.44% gc time, 98.23% compilation time: <1% of which was recompilation)
  0.139211 seconds (150.93 k allocations: 7.317 MiB)
  0.222412 seconds (183.58 k allocations: 9.617 MiB)
  0.136025 seconds (130.64 k allocations: 6.443 MiB)
  0.156769 seconds (134.27 k allocations: 6.698 MiB)
  0.173542 seconds (161.81 k allocations: 8.083 MiB)
  0.259515 seconds (198.07 k allocations: 10.637 MiB)
  0.169386 seconds (141.52 k allocations: 7.209 MiB)
  0.137833 seconds (123.39 k allocations: 5.932 MiB)
  0.172875 seconds (154.56 k allocations: 7.573 MiB)
  0.158121 seconds (154.56 k allocations: 7.573 MiB)
  0.155842 seconds (150.93 k allocations: 7.317 MiB)

Visualise the results : parameters

vec_models_vfast = Vector{SB}(undef, 12)
for imonth in 1:12
    vec_models_vfast[imonth] = load("data/vfastfitted_month_QMC100" * string(imonth) * ".jld2")["d"]
end
ntot = length(Yobs[1,:])
using CairoMakie

function plot_parameters(models::Vector{SB})
    months=1:12
month_labels = ["Jan","Feb","Mar","Apr","May","Jun",
                "Jul","Aug","Sep","Oct","Nov","Dec"]
    ρ = [m.range for m in models]
    λ = hcat([m.λ for m in models]...)  # (n_sites × n_months)

    fig = Figure(size = (900, 500))
    ax1 = Axis(fig[1, 1],
        xlabel = "Month",
        ylabel = L"\rho",
        title  = L"Estimated range parameter $\rho$",
            xticks = (months, month_labels)

    )
    scatter!(ax1, months, ρ)
    lines!(ax1, months, ρ)

    ax2 = Axis(fig[1, 2],
        xlabel = "Month",
        ylabel = L"\lambda_s",
        title  = L"Estimated rain probabilities $\lambda_s$",
            xticks = (months, month_labels)

    )
    for s in 1:size(λ, 1)
        lines!(ax2, months, λ[s, :], linewidth = 1, alpha = 0.6)
    end

    fig
end




plot_parameters(vec_models_vfast)

Visualise the results : comparing simulations to observations

The goal of stochastic weather generators is to be able to simulate quickly many plausible sequences of the meteorological variables, sharing the statistical properties of the observations.

An objective of this secific model is to accurately reproduce large-scale spatial events, such as widespread dry or wet days. We define the ROR indicator as ROR(n) = \frac{1}{D}\sum_{s=1}^D Y_s^{(n)} A low (high) value of ROR(n) denotes a dry (wet) day for many locations at the same time.

The distribution in the observations is compared to the distribution evaluated from many simulations, as well as the autocorrelation of this indicator, for each season.

Unsuprisingly, there is a clear lack of temporal dependance when using a simple SpatialBernoulli, suggesting the need for a structured model (such as HMMs! ). However, the distribution of the indicator itself is not that far off.

Nb = 10
D = 37
using StatsBase
using LinearAlgebra, NaNMath

begin
    Ysvf = zeros(Bool, nlocs, ntot, Nb)
    @time "Simulations  Y" for i in 1:Nb
        for t in 1:ntot
        m = month(every_year[t])
        Ysvf[:, t, i] = rand(vec_models_vfast[m]);
        end
    end
end

RRmax = 0
RORo = [mean(r .> RRmax) for r in eachcol(Yobs)]
RORs = [[mean(r .> RRmax) for r in eachcol(rr)] for rr in eachslice(Ysvf, dims=3)]

maxlag = 10

include("assets/utilities.jl")
begin
    # Makie ROR distribution and autocorrelation (2×4 grid)
JJA = [6, 7, 8]
MAM = [3, 4, 5]
SON = [9, 10, 11]
DJF = [12, 1, 2]
SEASONS = [DJF, MAM, JJA, SON]
seasonname = ["DJF", "MAM", "JJA", "SON"]
idx_seasons = [findall(month.(every_year) .∈ tuple(season)) for season in SEASONS]
    fig_ROR = Figure(fontsize=19)
    wwww = 200
    hhhh = 150
    # Row 1: Distribution plots
    for m in eachindex(idx_seasons)
        row = 1
        col = m

        # Distribution subplot
        ax_dist = Axis(fig_ROR[row, col],
            xlabel="ROR",
            ylabel=col == 1 ? "Distribution" : "",
            title=seasonname[m],
            width=wwww,
            height=hhhh)
        xax = 0:(1/D):1.0
        xaxbin = vcat(xax, [1.01])
        errorlinehist!(ax_dist, [RORs[i][idx_seasons[m]] for i in 1:Nb];
            label="",
            color=:gray,
            secondarycolor=:gray, normalization=:probability,
            bins=xaxbin,
            errortype=:percentile,
            percentiles=[0, 100],
            alpha=0.5,
            secondaryalpha=0.2,
            centertype=:median)
      scatter!(ax_dist, xax, [mean(RORo[idx_seasons[m]] .== x) for x in xax], color=:blue, markersize=6, label=m == 1 ? "Observations" : nothing)

        ylims!(ax_dist, 0, 0.06)
        col > 1 && hideydecorations!(ax_dist, grid=false)

        # Autocorrelation subplot
        row = 2
        ax_acf = Axis(fig_ROR[row, col],
            xlabel="Lag",
            ylabel=col == 1 ? "ACF" : "",
            width=wwww,
            height=hhhh,
        )

        rorsim = [RORs[i][idx_seasons[m]] for i in 1:Nb]
        acf_sim = [autocor(rorsim[i], 0:maxlag) for i in 1:length(rorsim)]
        errorline!(ax_acf, 0:maxlag, stack(acf_sim, dims=1)',
            color=:gray,
            secondarycolor=:gray,
            errortype=:percentile,
            percentiles=[0, 100],
            secondaryalpha=0.2,
            centertype=:median)

        # Observations
        acf_obs = autocor(RORo, 0:maxlag)
        scatter!(ax_acf, 0:maxlag, acf_obs, color=:blue, markersize=7)
        col > 1 && hideydecorations!(ax_acf, grid=false)
    end

    Legend(
    fig_ROR[:, 5],
    [
        [LineElement(color=:gray), PolyElement(color=:gray, alpha=0.2)],

        MarkerElement(color=:blue, marker=:circle, markersize=8)
    ],
    [
        "SB",
        "Observations"
    ]
)

    resize_to_layout!(fig_ROR)
    fig_ROR
end