Skip to content

Commit 71e35c2

Browse files
author
Gal Ben David
committed
Improved multithreading performance. Scan methods are now interruptible.
1 parent e2146da commit 71e35c2

File tree

4 files changed

+107
-40
lines changed

4 files changed

+107
-40
lines changed

Cargo.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "pyrepscan"
3-
version = "0.8.0"
3+
version = "0.9.0"
44
authors = ["Gal Ben David <[email protected]>"]
55
edition = "2018"
66
description = "A Git Repository Secrets Scanner written in Rust"
@@ -36,10 +36,11 @@ crate-type = ["cdylib"]
3636

3737
[dependencies]
3838
regex = "1"
39-
rayon = "1.5"
4039
chrono = "0.4"
41-
parking_lot = "0.11"
4240
num_cpus = "1"
41+
parking_lot = "0.11"
42+
crossbeam = "0.8"
43+
crossbeam-utils = "0.8"
4344

4445
[dependencies.git2]
4546
version = "0.13"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ strip = true
1414

1515
[tool.poetry]
1616
name = "pyrepscan"
17-
version = "0.8.0"
17+
version = "0.9.0"
1818
authors = ["Gal Ben David <[email protected]>"]
1919
description = "A Git Repository Secrets Scanner written in Rust"
2020
readme = "README.md"

src/git_repository_scanner.rs

Lines changed: 87 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
use crate::rules_manager;
22

33
use chrono::prelude::*;
4+
use crossbeam_utils::thread as crossbeam_thread;
5+
use crossbeam::queue::SegQueue;
46
use git2::{Oid, Repository, Delta};
5-
use git2::Error;
67
use parking_lot::Mutex;
7-
use rayon::prelude::*;
8+
use pyo3::prelude::*;
89
use std::collections::HashMap;
910
use std::path::Path;
1011
use std::sync::Arc;
12+
use std::sync::atomic::{AtomicBool, Ordering};
13+
use std::thread;
14+
use std::time;
1115

1216
fn scan_commit_oid(
17+
should_stop: &AtomicBool,
1318
git_repo: &Repository,
1419
oid: &Oid,
1520
rules_manager: &rules_manager::RulesManager,
1621
output_matches: Arc<Mutex<Vec<HashMap<&str, String>>>>,
17-
) -> Result<(), Error> {
22+
) -> Result<(), git2::Error> {
1823
let commit = git_repo.find_commit(*oid)?;
1924

2025
let commit_parent_count = commit.parent_count();
@@ -34,6 +39,10 @@ fn scan_commit_oid(
3439
};
3540

3641
for delta in commit_diff.deltas() {
42+
if should_stop.load(Ordering::Relaxed) {
43+
break;
44+
}
45+
3746
match delta.status() {
3847
Delta::Added | Delta::Modified => {},
3948
_ => continue,
@@ -119,25 +128,22 @@ fn scan_commit_oid(
119128
pub fn get_file_content(
120129
repository_path: &str,
121130
file_oid: &str,
122-
) -> Result<Vec<u8>, Error> {
131+
) -> Result<Vec<u8>, git2::Error> {
123132
let git_repo = Repository::open(repository_path)?;
124133
let oid = Oid::from_str(file_oid)?;
125134
let blob = git_repo.find_blob(oid)?;
126135

127136
Ok(blob.content().to_vec())
128137
}
129138

130-
pub fn scan_repository(
139+
fn get_oids(
131140
repository_path: &str,
132141
branch_glob_pattern: &str,
133142
from_timestamp: i64,
134-
rules_manager: &rules_manager::RulesManager,
135-
output_matches: Arc<Mutex<Vec<HashMap<&str, String>>>>,
136-
) -> Result<(), Error> {
143+
) -> Result<Vec<Oid>, git2::Error>{
137144
let git_repo = Repository::open(repository_path)?;
138145

139146
let mut revwalk = git_repo.revwalk()?;
140-
141147
revwalk.push_head()?;
142148
revwalk.set_sorting(git2::Sort::TIME)?;
143149
revwalk.push_glob(branch_glob_pattern)?;
@@ -153,22 +159,79 @@ pub fn scan_repository(
153159
}
154160
}
155161

156-
let chunk_size = (oids.len() as f64 / (num_cpus::get() * 5) as f64).ceil() as usize;
157-
if !oids.is_empty() {
158-
oids.par_chunks(chunk_size).for_each(
159-
|oids| {
160-
let git_repo = Repository::open(repository_path).unwrap();
161-
for oid in oids {
162-
scan_commit_oid(
163-
&git_repo,
164-
oid,
165-
rules_manager,
166-
output_matches.clone()
167-
).unwrap_or(());
168-
}
169-
},
170-
);
162+
Ok(oids)
163+
}
164+
165+
pub fn scan_repository(
166+
py: &Python,
167+
repository_path: &str,
168+
branch_glob_pattern: &str,
169+
from_timestamp: i64,
170+
rules_manager: &rules_manager::RulesManager,
171+
output_matches: Arc<Mutex<Vec<HashMap<&str, String>>>>,
172+
) -> Result<(), PyErr> {
173+
let oids_queue = Arc::new(SegQueue::new());
174+
match get_oids(
175+
repository_path,
176+
branch_glob_pattern,
177+
from_timestamp
178+
) {
179+
Ok(oids) => {
180+
for oid in oids {
181+
oids_queue.push(oid);
182+
}
183+
},
184+
Err(error) => {
185+
return Err(pyo3::exceptions::PyRuntimeError::new_err(error.to_string()))
186+
},
171187
}
188+
py.check_signals()?;
189+
190+
let mut py_signal_error: PyResult<()> = Ok(());
191+
192+
crossbeam_thread::scope(
193+
|scope| {
194+
let should_stop = Arc::new(AtomicBool::new(false));
195+
196+
for _ in 0..num_cpus::get() {
197+
let output_matches = output_matches.clone();
198+
let oids_queue = oids_queue.clone();
199+
let should_stop = should_stop.clone();
200+
scope.spawn(
201+
move |_| {
202+
if let Ok(git_repo) = Repository::open(repository_path) {
203+
while !should_stop.load(Ordering::Relaxed) {
204+
if let Some(oid) = oids_queue.pop() {
205+
scan_commit_oid(
206+
&should_stop,
207+
&git_repo,
208+
&oid,
209+
rules_manager,
210+
output_matches.clone(),
211+
).unwrap_or(());
212+
} else {
213+
break;
214+
}
215+
}
216+
};
217+
}
218+
);
219+
}
220+
221+
while !oids_queue.is_empty() {
222+
py_signal_error = py.check_signals();
223+
if py_signal_error.is_err() {
224+
should_stop.store(true, Ordering::Relaxed);
225+
226+
break;
227+
}
228+
229+
thread::sleep(time::Duration::from_millis(100));
230+
}
231+
}
232+
).unwrap_or(());
233+
234+
py_signal_error?;
172235

173236
Ok(())
174237
}

src/lib.rs

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
mod git_repository_scanner;
22
mod rules_manager;
33

4-
use git2::Repository;
54
use parking_lot::Mutex;
65
use pyo3::exceptions;
76
use pyo3::prelude::*;
87
use pyo3::types::PyBytes;
98
use std::collections::HashMap;
9+
use std::path::PathBuf;
1010
use std::sync::Arc;
1111

1212
/// GitRepositoryScanner class
@@ -201,17 +201,16 @@ impl GitRepositoryScanner {
201201
from_timestamp: Option<i64>,
202202
) -> PyResult<Py<PyAny>> {
203203
let matches = Arc::new(Mutex::new(Vec::<HashMap<&str, String>>::with_capacity(10000)));
204-
205-
if let Err(error) = git_repository_scanner::scan_repository(
204+
match git_repository_scanner::scan_repository(
205+
&py,
206206
repository_path,
207207
branch_glob_pattern.unwrap_or("*"),
208208
from_timestamp.unwrap_or(0),
209209
&self.rules_manager,
210210
matches.clone(),
211211
) {
212-
Err(exceptions::PyRuntimeError::new_err(error.to_string()))
213-
} else {
214-
Ok(matches.lock().to_object(py))
212+
Ok(_) => Ok(matches.lock().to_object(py)),
213+
Err(error) => Err(error),
215214
}
216215
}
217216

@@ -242,22 +241,26 @@ impl GitRepositoryScanner {
242241
branch_glob_pattern: Option<&str>,
243242
from_timestamp: Option<i64>,
244243
) -> PyResult<Py<PyAny>> {
245-
let matches = Arc::new(Mutex::new(Vec::<HashMap<&str, String>>::with_capacity(10000)));
244+
let mut repository_full_path = PathBuf::from(repository_path);
245+
repository_full_path.push(url.split('/').last().unwrap_or(""));
246246

247-
if let Err(error) = Repository::clone(url, repository_path) {
247+
let mut builder = git2::build::RepoBuilder::new();
248+
builder.bare(true);
249+
if let Err(error) = builder.clone(url, repository_full_path.as_path()) {
248250
return Err(exceptions::PyRuntimeError::new_err(error.to_string()));
249251
};
250252

251-
if let Err(error) = git_repository_scanner::scan_repository(
253+
let matches = Arc::new(Mutex::new(Vec::<HashMap<&str, String>>::with_capacity(10000)));
254+
match git_repository_scanner::scan_repository(
255+
&py,
252256
repository_path,
253257
branch_glob_pattern.unwrap_or("*"),
254258
from_timestamp.unwrap_or(0),
255259
&self.rules_manager,
256260
matches.clone(),
257261
) {
258-
Err(exceptions::PyRuntimeError::new_err(error.to_string()))
259-
} else {
260-
Ok(matches.lock().to_object(py))
262+
Ok(_) => Ok(matches.lock().to_object(py)),
263+
Err(error) => Err(error),
261264
}
262265
}
263266
}

0 commit comments

Comments
 (0)