Skip to content

Fix MatrixDepot benchmark CI failures and improve error handling #1341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 92 additions & 15 deletions benchmarks/LinearSolve/MatrixDepot.jmd
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,21 @@ using LinearAlgebra, SparseArrays, LinearSolve, Sparspak
import Pardiso
using Plots
using MatrixDepot
using Dates

# Global BenchmarkTools defaults used by every `@belapsed` call in this script.
# `seconds` is the time budget BenchmarkTools may spend on a single benchmark.
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

# Why do I need to set this ?
# `samples` is the maximum number of samples collected per benchmark.
BenchmarkTools.DEFAULT_PARAMETERS.samples = 10

# NOTE(review): these two parameters are not timeouts. In BenchmarkTools they are
# the noise tolerances applied when comparing results after execution, so they
# do not bound how long a benchmark runs. The values also look like the
# library defaults — confirm whether these assignments are needed at all.
BenchmarkTools.DEFAULT_PARAMETERS.time_tolerance = 0.05
BenchmarkTools.DEFAULT_PARAMETERS.memory_tolerance = 0.01

# Wall-clock reference points: `start_time` is used for elapsed-time reporting,
# `last_heartbeat` records when the most recent heartbeat message was logged.
start_time = now()
last_heartbeat = now()

algs = [
UMFPACKFactorization(),
KLUFactorization(),
Expand All @@ -29,7 +38,14 @@ cols = [:red, :blue, :green, :magenta, :turqoise] # one color per alg
# matrices = ["HB/1138_bus", "HB/494_bus", "HB/662_bus", "HB/685_bus", "HB/bcsstk01", "HB/bcsstk02", "HB/bcsstk03", "HB/bcsstk04", "HB/bcsstk05", "HB/bcsstk06", "HB/bcsstk07", "HB/bcsstk08", "HB/bcsstk09", "HB/bcsstk10", "HB/bcsstk11", "HB/bcsstk12", "HB/bcsstk13", "HB/bcsstk14", "HB/bcsstk15", "HB/bcsstk16"]
allmatrices_md = listnames("*/*")

@info "Total number of matrices: $(allmatrices_md.content[1].rows)"
# Number of entries in the first table of the `listnames("*/*")` result; this is
# the upper bound of the main benchmark loop.
total_matrices = length(allmatrices_md.content[1].rows)
@info "Total number of matrices: $total_matrices"

# Track progress and failures
# `processed_count` is bumped once per matrix that was benchmarked, failed, or
# was skipped for size; the three vectors collect the matrix names per outcome.
# NOTE(review): `processed_count` is reassigned inside the top-level `for` loop;
# whether that updates this global depends on how the evaluating environment
# handles soft scope — confirm under the CI runner.
processed_count = 0
failed_matrices = String[]
successful_matrices = String[]
skipped_large_matrices = String[]

# One row per matrix, one column per algorithm; an entry stays NaN when no timing
# was recorded for that (matrix, algorithm) pair.
times = fill(NaN, length(allmatrices_md.content[1].rows), length(algs))
# One entry per matrix; NaN entries are filtered out after the loop.
percentage_sparsity = fill(NaN, length(allmatrices_md.content[1].rows))
Expand Down Expand Up @@ -62,7 +78,32 @@ end
```

```julia
for z in 1:length(allmatrices_md.content[1].rows)
for z in 1:total_matrices
# Early termination once the total number of failed matrices exceeds 100 (cumulative count, not consecutive)
if length(failed_matrices) > 100
@warn "Too many failures (>100), terminating benchmark early to prevent CI timeout"
break
end

# Heartbeat: log a status line when more than 30 seconds have passed since the last one (checked once per matrix, so a slow matrix can delay it)
current_time = now()
if current_time - last_heartbeat > Dates.Second(30)
elapsed = round((current_time - start_time) / Dates.Minute(1), digits=1)
@info "Heartbeat: Still running... ($(elapsed) minutes elapsed, matrix $z/$total_matrices)"
last_heartbeat = current_time
flush(stdout)
flush(stderr)
end

# Progress tracking - print every 10 matrices or on first/last
if z == 1 || z == total_matrices || z % 10 == 0
@info "Progress: Processing matrix $z of $total_matrices ($(round(100*z/total_matrices, digits=1))%)"
@info " - Successful: $(length(successful_matrices))"
@info " - Failed: $(length(failed_matrices))"
@info " - Skipped (too large): $(length(skipped_large_matrices))"
flush(stdout)
flush(stderr)
end
try
matrix = allmatrices_md.content[1].rows[z]
matrix = string(matrix[1])
Expand All @@ -76,8 +117,15 @@ for z in 1:length(allmatrices_md.content[1].rows)

mtx_copy = copy(A)

@info "$n × $n"
n > 100 && error("Skipping too large matrices")
# Check matrix size and skip if too large
if n > 1500
@info "Matrix $currMTX ($n × $n) is too large, skipping..."
push!(skipped_large_matrices, currMTX)
processed_count += 1
continue
end

@info "Processing $currMTX: $n × $n matrix"

## COMPUTING SPACED OUT SPARSITY
rows, cols = size(mtx_copy)
Expand Down Expand Up @@ -105,12 +153,18 @@ for z in 1:length(allmatrices_md.content[1].rows)
u0 = rand(rng, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy($A),
copy($b);
u0 = copy($u0),
alias_A = true,
alias_b = true))
times[z, j] = bt
try
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy($A),
copy($b);
u0 = copy($u0),
alias_A = true,
alias_b = true))
times[z, j] = bt
catch alg_error
# Silently record NaN for failed algorithms
times[z, j] = NaN
@debug "Algorithm $(algnames[j]) failed on $currMTX: $(typeof(alg_error))"
end
end

bandedness_five[z] = compute_bandedness(A, 5)
Expand All @@ -130,18 +184,41 @@ for z in 1:length(allmatrices_md.content[1].rows)
display(p)
=#

println("successfully factorized $(currMTX)")
push!(successful_matrices, currMTX)
processed_count += 1
@debug "Successfully factorized $currMTX"
catch e
matrix = allmatrices_md.content[1].rows[z]
matrix = string(matrix[1])

currMTX = matrix

println("$(currMTX) failed to factorize.")
println(e)

push!(failed_matrices, currMTX)
processed_count += 1

# Only print brief error info, not full stacktrace
error_type = typeof(e)
@warn "Matrix $currMTX failed: $error_type"
end
end

# End-of-run report.
# Wall-clock duration of the whole run in minutes, rounded to one decimal place.
total_elapsed = round((now() - start_time) / Dates.Minute(1), digits=1)

# Emit the summary banner, one log record per line, in a fixed order.
for summary_line in (
    "="^60,
    "Benchmark Complete!",
    "Total runtime: $total_elapsed minutes",
    "Total matrices processed: $processed_count / $total_matrices",
    "Successful: $(length(successful_matrices))",
    "Failed: $(length(failed_matrices))",
    "Skipped (too large): $(length(skipped_large_matrices))",
    "="^60,
)
    @info summary_line
end

# List the failing matrices; only the first 20 names are printed when there are
# more than 20, to keep the log short.
if !isempty(failed_matrices)
    if length(failed_matrices) > 20
        @info "Failed matrices (first 20): $(join(failed_matrices[1:20], ", "))..."
    else
        @info "Failed matrices: $(join(failed_matrices, ", "))"
    end
end

percentage_sparsity = percentage_sparsity[.!isnan.(percentage_sparsity)]
spaced_out_sparsity = spaced_out_sparsity[.!isnan.(spaced_out_sparsity)]
spaced_out_sparsity = replace(spaced_out_sparsity, 0 => 1e-10)
Expand Down
Loading