Skip to content

Commit 5de096c

Browse files
committed
Move simple gitlab scraper to project.
Adds the same arguments as the GitHub scraper and integrates some parts: the repo list is generated and `should_stop` is respected. However, the GitLab and GitHub scrapes run sequentially, and the GitLab repos are not yet saved.

Pending work:
* Integrate with the data type — GitHub uses integer IDs while GitLab uses strings, so some modification is needed; it may be best to include the provider in the ID and move to a string ID.
* Validate that each repo really is Rust, etc.
* Multi-threading to match GitHub's performance — but the API appears to be cursor-based, so this may not be possible (there might be a way if it is acceptable to lose new repos that appear mid-scrape).
1 parent 5aab7fd commit 5de096c

File tree

2 files changed

+115
-0
lines changed

2 files changed

+115
-0
lines changed

src/gitlab/mod.rs

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
use config::Config;
2+
use data::{Data, Repo};
3+
use prelude::*;
4+
use reqwest::blocking::Client;
5+
use serde::Deserialize;
6+
use std::sync::atomic::{AtomicBool, Ordering};
7+
8+
const GITLAB_GRAPHQL_ENDPOINT: &str = "https://gitlab.com/api/graphql";
9+
10+
static USER_AGENT: &str = "rust-repos (https://github.com/rust-ops/rust-repos)";
11+
12+
static GRAPHQL_QUERY_REPOSITORIES: &str = r#"
13+
query ListRustRepos($after: String) {
14+
projects(
15+
first: 50
16+
after: $after
17+
programmingLanguageName: "Rust"
18+
) {
19+
pageInfo {
20+
hasNextPage
21+
endCursor
22+
}
23+
nodes {
24+
id
25+
name
26+
path
27+
webUrl
28+
}
29+
}
30+
}
31+
"#;
32+
33+
#[derive(Debug, Deserialize)]
34+
struct PageInfo {
35+
hasNextPage: bool,
36+
endCursor: Option<String>,
37+
}
38+
39+
#[derive(Debug, Deserialize)]
40+
struct Project {
41+
id: String,
42+
name: String,
43+
path: String,
44+
webUrl: String,
45+
}
46+
47+
#[derive(Debug, Deserialize)]
48+
struct Namespace {
49+
fullPath: String,
50+
}
51+
52+
#[derive(Debug, Deserialize)]
53+
struct Projects {
54+
pageInfo: PageInfo,
55+
nodes: Vec<Project>,
56+
}
57+
58+
#[derive(Debug, Deserialize)]
59+
struct ApiData {
60+
projects: Projects,
61+
}
62+
63+
#[derive(Debug, Deserialize)]
64+
struct GraphQLResponse {
65+
data: Option<ApiData>,
66+
errors: Option<serde_json::Value>,
67+
}
68+
69+
pub fn scrape(data: &Data, config: &Config, should_stop: &AtomicBool) -> Fallible<()> {
70+
let client = Client::new();
71+
72+
let mut after: Option<String> = None;
73+
let mut page = 1;
74+
75+
while !should_stop.load(Ordering::SeqCst) {
76+
println!("Fetching page {page}...");
77+
78+
let variables = serde_json::json!({ "after": after });
79+
80+
let resp: GraphQLResponse = client
81+
.post(GITLAB_GRAPHQL_ENDPOINT)
82+
.json(&serde_json::json!({
83+
"query": GRAPHQL_QUERY_REPOSITORIES,
84+
"variables": variables
85+
}))
86+
.send()?
87+
.json()?;
88+
89+
if let Some(errors) = resp.errors {
90+
eprintln!("GraphQL errors: {errors:#?}");
91+
break;
92+
}
93+
94+
let data = resp.data.expect("No data returned");
95+
println!("{:?}", data);
96+
let projects = data.projects;
97+
98+
let mut last_id = data.get_last_id("gitlab")?.unwrap_or_default();
99+
for project in projects.nodes {
100+
println!("{:?}", project);
101+
}
102+
103+
if !projects.pageInfo.hasNextPage {
104+
println!("No more pages");
105+
break;
106+
}
107+
108+
after = projects.pageInfo.endCursor;
109+
page += 1;
110+
}
111+
112+
Ok(())
113+
}

src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ extern crate serde_json;
3636
mod config;
3737
mod data;
3838
mod github;
39+
mod gitlab;
3940
mod prelude;
4041
mod utils;
4142

@@ -95,6 +96,7 @@ fn app() -> Fallible<()> {
9596
stop.store(true, Ordering::SeqCst);
9697
})?;
9798

99+
gitlab::scrape(&data, &config, &should_stop)?;
98100
github::scrape(&data, &config, &should_stop)?;
99101

100102
Ok(())

0 commit comments

Comments
 (0)