Skip to content

Commit d4c0ecc

Browse files
grooverdanRazvanLiviuVarzaru
authored andcommitted
MDBF-143: Add Infer builder
This performs static analysis on the MariaDB codebase by maintaining a git source repository as a shared volume. Because static analysis takes time, a lot of time, there is a shared cache volume to store build results from main branches of the codebase so that as much incremental usage as possible can occur. Infer runs in two phases, a capture and an analyze. Infer outputs are in a result-dir, which contains: * report.json - what infer tools use * report.txt - the human readable version of this * capture.db - the sqlite3 representation of captured files and their relation to function definitions. * results.db - the analyze phase outputs. Of these, the report.json is desirable as the long term record of vulnerabilities, along with the main_diff containing the difference from the last main X.Y branch commit.
1 parent ea4e14f commit d4c0ecc

File tree

6 files changed

+394
-2
lines changed

6 files changed

+394
-2
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import os
2+
3+
from configuration.builders.infra.runtime import (
4+
BuildSequence,
5+
DockerConfig,
6+
InContainer,
7+
)
8+
from configuration.steps.commands.base import URL
9+
from configuration.steps.base import StepOptions
10+
from configuration.steps.commands.packages import SavePackages
11+
from configuration.steps.commands.util import InferScript, PrintEnvironmentDetails
12+
from configuration.steps.remote import ShellStep
13+
14+
15+
def infer(config: DockerConfig):
    """Return the build sequence of the Infer static analysis builder.

    The sequence has three steps: print the environment details, run the
    Infer script for the ``branch`` build property inside the container
    described by *config*, and save the ``infer_results`` package. The save
    step is flagged ``alwaysRun``.
    """
    sequence = BuildSequence()

    environment_step = ShellStep(command=PrintEnvironmentDetails())
    sequence.add_step(environment_step)

    analysis_step = ShellStep(
        command=InferScript("%(prop:branch)s"),
        options=StepOptions(
            description="running infer analysis",
            descriptionDone="infer analysis complete",
        ),
        # JOBS is read by the Infer script to size its parallelism.
        env_vars=[("JOBS", "%(prop:jobs)s")],
    )
    sequence.add_step(InContainer(docker_environment=config, step=analysis_step))

    save_step = ShellStep(
        command=SavePackages(
            packages=["infer_results"],
            destination="/packages/%(prop:tarbuildnum)s/logs/%(prop:buildername)s",
        ),
        url=URL(
            url=f"{os.environ['ARTIFACTS_URL']}/%(prop:tarbuildnum)s/logs/%(prop:buildername)s",
            url_text="Infer artifacts/logs",
        ),
        options=StepOptions(
            alwaysRun=True,
            description="saving infer analysis results",
            descriptionDone="infer analysis results saved",
        ),
    )
    sequence.add_step(InContainer(docker_environment=config, step=save_step))

    return sequence

configuration/steps/commands/base.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,14 @@ def as_cmd_arg(self) -> list[str]:
6262

6363

6464
class BashCommand(Command):
65-
def __init__(self, cmd: str, name: str = "Run command", user: str = "buildbot"):
66-
super().__init__(name=name, workdir=PurePath("."), user=user)
65+
def __init__(
    self,
    cmd: str,
    name: str = "Run command",
    user: str = "buildbot",
    workdir: PurePath = PurePath("."),
):
    """Store the command string to run.

    ``name``, ``user`` and ``workdir`` are forwarded unchanged to the base
    class initialiser; ``workdir`` defaults to the current directory.
    """
    super().__init__(user=user, workdir=workdir, name=name)
    self.cmd = cmd
6874

6975
def as_cmd_arg(self) -> list[str]:
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
#!/bin/bash

# Infer script for performing
# static analysis on the MariaDB codebase
#
# Usage: infer.sh {branch/commit}
#
# Environment:
#   JOBS - parallelism for git submodule updates, the build and
#          the analysis (default 4)

set -x -e

infer --version

if [ $# -lt 1 ]; then
  echo insufficient args >&2
  exit 1
fi

# Testing this version
branch=$1

if [ -z "$branch" ]; then
  echo "usage $0 {branch/commit}" >&2
  exit 1
fi

: "${JOBS:=4}"

# Where the script was started, index.txt and report.json live here
base=$PWD
# Results of this run, saved as a package by the CI
result_dir=$PWD/infer_results
# Cache of results of previous runs, one directory per commit
infer="/mnt/infer"

rm -rf "${result_dir}" index.txt report.json

## Fetch

pushd /mnt/src
if [ ! -d .git ]; then
  # Clone into /mnt/src itself. Without the explicit target the
  # repository ends up in /mnt/src/server and the git commands
  # below run outside of it.
  git clone https://github.com/MariaDB/server.git .
else
  git clean -df
fi
git fetch origin "$branch"
git checkout -f FETCH_HEAD
git submodule update --init --recursive --jobs "${JOBS}"
git clean -df
commit=$(git rev-parse FETCH_HEAD)

# Results are cached per commit, nothing to do if they exist
if [ -d "${infer}/$commit" ]; then
  echo "Already scanned $commit"
  exit 0
fi
49+
50+
# Make room in the results cache ($infer) before adding to it.
# Target maximum usage (in percent)
max_usage=90

# Print the disk usage of the filesystem holding $infer
# as an integer percent
get_usage() {
  df -P "$infer" | awk 'NR==2 {gsub(/%/,""); print $5}'
}

echo "Checking disk usage on $(df -h "$infer" | tail -n -1)"
usage=$(get_usage)
echo "Current usage: ${usage}%"

# Directories directly under $infer, least recently modified first
mapfile -t dirs < <(
  find "$infer" -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' \
    | sort -n | awk '{print $2}'
)

# Remove one directory at a time, re-measuring after each,
# and stop as soon as the usage is below the threshold
for dir in "${dirs[@]}"; do
  if (( usage >= max_usage )); then
    echo "Deleting oldest directory: $dir"
    rm -rf -- "$dir"

    usage=$(get_usage)
    echo "New usage: ${usage}%"
    continue
  fi

  echo "Disk usage is ${usage}%, below ${max_usage}%. Done."
  break
done

if (( usage < max_usage )); then
  echo "Done. Disk usage now ${usage}%."
else
  echo "Warning: disk still above ${max_usage}% after deleting all directories!"
fi
88+
89+
90+
# What can we use as a reference
91+
92+
# Set up an incremental analysis on top of an already analysed commit.
#
# Input:   $merge_base - commit at which the search for analysed
#                        commits stops (exclusive)
# Returns: 0 with $merge_base set to the analysed commit used as the
#            reference, $base/index.txt listing the files changed
#            since it, and its results copied to $result_dir
#          1 if a full analysis is needed; index.txt doesn't exist then
# Ends the script successfully if no files changed since the reference.
populate_differences()
{
  local candidate reference=

  # Set before any return so that the limit can always be reported
  limit=50

  # Find something closer - e.g. we've appended to a branch
  # we've already tested
  mapfile -t commits < <(git rev-list "${merge_base}..FETCH_HEAD")
  for candidate in "${commits[@]}"; do
    if [ -d "${infer}/$candidate" ]; then
      reference=$candidate
      break
    fi
  done
  # Tracking the hit explicitly also covers an empty commit list,
  # where a directory test on "${infer}/" would always succeed.
  if [ -z "$reference" ]; then
    return 1
  fi

  # The files changed since the last results
  git diff --name-only FETCH_HEAD.."${reference}" | tee "$base"/index.txt

  if [ ! -s "$base"/index.txt ]; then
    echo "Empty changes - nothing necessary"
    rm "$base"/index.txt
    exit 0
  fi

  if [ "$(wc -l < "${base}"/index.txt)" -gt $limit ]; then
    echo "More than $limit changes, just do a full generation"
    rm "$base/index.txt"
    return 1
  fi

  # use previous results as a base
  # 'set -e' has no effect inside a function used as an 'if'
  # condition, so a failed copy has to be handled explicitly.
  if ! cp -a "$infer/$reference" "$result_dir"; then
    echo "Copying previous results failed, falling back to a full generation" >&2
    rm -rf "$result_dir" "$base"/index.txt
    return 1
  fi

  # Using as a recently used marker
  # Eventually we can remove/clear based on not being looked at
  touch "$infer/$reference"

  merge_base=$reference
  return 0
}
131+
132+
# Just assume we diverged from main at some point
# Using $commit because merge-base didn't process
# pull request references.
merge_base=$(git merge-base "$commit" origin/main)

# populate_differences returns non-zero when the incremental setup
# was not possible, which is the case that leads to a full scan.
if ! populate_differences; then
  echo "No common commit ancestor with analysis or over depth limit($limit)" >&2

  echo "This is going to take a while for a full scan"
fi

# back from /mnt/src
popd
145+
146+
# Build

# Configure /mnt/src into ./bld with clang and an exported
# compile_commands.json, then build the GenError, GenServerSource,
# GenUnicodeDataSource and GenFixPrivs targets with $JOBS jobs
build()
{
  local configure_opts=(
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
    -DCMAKE_C_COMPILER=clang
    -DCMAKE_CXX_COMPILER=clang++
  )
  local targets=(GenError GenServerSource GenUnicodeDataSource GenFixPrivs)

  cmake "${configure_opts[@]}" -S /mnt/src -B bld
  cmake --build bld --target "${targets[@]}" --parallel "$JOBS"
}

# An existing build directory is reused as it is
if ! [ -d bld ]; then
  mkdir bld
  build
fi
163+
164+
#
165+
# Run the infer capture phase using the compile_commands.json of the
# current directory, writing to $result_dir.
# Any arguments are passed on to infer.
capture()
{
  local capture_opts=(
    --compilation-database compile_commands.json
    --project-root /mnt/src
    --results-dir "${result_dir}"
  )
  infer capture "${capture_opts[@]}" "$@"
}

# Run the infer analyze phase on $result_dir with $JOBS jobs.
# Any arguments are passed on to infer.
analyze()
{
  local analyze_opts=(
    --project-root /mnt/src
    --results-dir "${result_dir}"
    --max-jobs "${JOBS}"
  )
  infer analyze "${analyze_opts[@]}" "$@"
}
174+
# Capture and analyze, either everything or, when index.txt exists,
# only what is affected by the files listed in it
cd bld

if [ ! -f ../index.txt ]; then
  echo "full run, this could take a while"
  capture
  analyze
  # The results are saved to $infer/$commit once, after this block.
  # Saving them here as well makes that later copy land inside the
  # then existing directory as a nested second copy.
  cd ..
else
  echo "incremental run"
  # We've copied over a result dir, so we're continuing
  # https://fbinfer.com/docs/infer-workflow/#differential-workflow
  # using 'infer capture' instead of 'infer run'
  capture --reactive

  # some form of incremental
  analyze --changed-files-index ../index.txt

  # Preserve result
  cp "${result_dir}"/report.json ../report.json

  # just in case these have changed, including generated files
  cd ..
  build
  cd bld

  # Can we use the previous captured $infer/$merge_base
  capture --merge-capture "$infer/$merge_base" --reactive --mark-unchanged-procs

  analyze --incremental-analysis --changed-files-index ../index.txt

  # It may be merged next, or a commit pushed on top of it.
  infer reportdiff --report-current ../report.json --report-previous "${result_dir}"/report.json --project-root /mnt/src --results-dir "${result_dir}"
  cd ..
  ## At this point we have infer_results/differential/{fixed,introduced}.json
  #!? Change the name as we're going to use differential as a main branch difference
  #!!mv "${result_dir}"/differential "${result_dir}"/diff_prev_commit
fi
rm -rf bld index.txt

# Useful enough to save as $infer/
# It's unknown if this is on a main branch or not, but just save.
# If it's merged next, then a commit exists; if a user appends
# a commit, we've got a smaller delta.
cp -a "${result_dir}" "$infer/${commit}"
221+
222+
# Look at the changes from the main branch
#
# Take the main branch report.json
# remove fixed, add introduced, and then walk
# through other commits, if they exist, and apply the
# same again up until, and including the last commit
source /mnt/src/VERSION
branch=${MYSQL_VERSION_MAJOR}.${MYSQL_VERSION_MINOR}

pushd /mnt/src
merge_base=$(git merge-base "origin/$branch" "$commit")
# Oldest first: every differential is applied on top of the report
# produced from the commits before it. The default order of
# rev-list is newest first.
mapfile -t commits < <(git rev-list --reverse "${merge_base}..${commit}")
popd

# Results of the main branch commit we diverged from
base=$infer/$merge_base
last_ref=$base
for common_commit in "${commits[@]}"; do
  diff_dir="${infer}/$common_commit"/differential/
  if [ -d "$diff_dir" ]; then
    # remove fixed issues and append introduced.
    jq --slurpfile to_remove "${diff_dir}"/fixed.json '
      ($to_remove[0] | map(.hash)) as $hashes_to_remove
      | map(select(.hash as $h | $hashes_to_remove | index($h) | not))' \
      "${last_ref}"/report.json > filtered.json
    jq -s 'add | unique_by(.hash)' filtered.json "${diff_dir}"/introduced.json > report.json
    # The next differential applies to the report just written
    last_ref=$PWD
  fi
done

# Without any differential to apply and without a report left by an
# incremental run there is no report.json here. The report of this
# run is then the current state to compare with.
if [ ! -f report.json ]; then
  cp "${result_dir}"/report.json report.json
fi

infer reportdiff --report-current report.json --report-previous "${base}"/report.json --project-root /mnt/src --results-dir "${result_dir}_diff"

result_dir_main_diff=${result_dir}/main_diff
mv "${result_dir}_diff"/differential/ "${result_dir_main_diff}"
# $infer/$commit exists by now, so this adds main_diff to it
cp -a "${result_dir_main_diff}" "$infer/${commit}"

# cleanup for smaller CI
rm -rf "${result_dir}"/*.db "${result_dir}"/tmp
259+
260+
# Print the issues listed in a differential json file.
#   $1 - json file with an array of issues
#   $2 - message to print before the issues
# Returns 1 if the file lists issues, 0 if it is missing or empty.
check()
{
  local file=$1
  local msg=$2
  local filesize
  if [ -f "${file}" ]; then
    filesize=$(stat -c%s "$file")
    # 2 is the size of an empty json array '[]'
    if [ "$filesize" -gt 2 ]; then
      echo "$msg"
      echo
      echo "Here are the changes:"
      jq . "${file}"
      return 1
    fi
  fi
  return 0
}

# The first three reports are informational. check returns non-zero
# when it has something to print, which under 'set -e' would end the
# script right there, even when reporting fixes.
check "${result_dir}"/differential/fixed.json "Good human! Thanks for fixing the bad things in the last commit" || true

check "${result_dir}"/differential/introduced.json "Bad human! Don't introduce bad things in the last commit" >&2 || true

check "${result_dir_main_diff}"/fixed.json "Good human! Thanks for fixing the bad things" || true

# Fail if issues were introduced compared to the main branch
if ! check "${result_dir_main_diff}"/introduced.json "Bad human! Don't introduce bad things" >&2; then
  exit 1
fi

configuration/steps/commands/util.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,3 +173,12 @@ def __init__(
173173
):
174174
args = [f"{binary}:{','.join(libs)}" for binary, libs in binary_checks.items()]
175175
super().__init__(script_name="ldd_check.sh", args=args)
176+
177+
178+
class InferScript(BashScriptCommand):
    """Run ``infer.sh``, the Infer analysis script for the MariaDB codebase.

    The given branch is passed to the script as its only argument.
    """

    def __init__(self, branch: str):
        script_args = [branch]
        super().__init__(script_name="infer.sh", args=script_args)

constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@
177177
"amd64-debian-12-debug-embedded",
178178
"amd64-fedora-41",
179179
"amd64-fedora-42",
180+
"amd64-infer-clang-20",
180181
"amd64-msan-clang-20-debug",
181182
"amd64-opensuse-1506",
182183
"amd64-rhel-10",

0 commit comments

Comments
 (0)