Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
434 changes: 432 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ crates_io_docs_rs = { path = "crates/crates_io_docs_rs" }
crates_io_env_vars = { path = "crates/crates_io_env_vars" }
crates_io_github = { path = "crates/crates_io_github" }
crates_io_index = { path = "crates/crates_io_index" }
crates_io_linecount = { path = "crates/crates_io_linecount" }
crates_io_markdown = { path = "crates/crates_io_markdown" }
crates_io_og_image = "=0.2.1"
crates_io_pagerduty = { path = "crates/crates_io_pagerduty" }
Expand Down
2 changes: 2 additions & 0 deletions crates/crates_io_database/src/models/version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ pub struct Version {
pub documentation: Option<String>,
pub repository: Option<String>,
pub trustpub_data: Option<TrustpubData>,
pub linecounts: Option<serde_json::Value>,
}

impl Version {
Expand Down Expand Up @@ -109,6 +110,7 @@ pub struct NewVersion<'a> {
categories: Option<&'a [&'a str]>,
keywords: Option<&'a [&'a str]>,
trustpub_data: Option<&'a TrustpubData>,
linecounts: Option<serde_json::Value>,
}

impl NewVersion<'_> {
Expand Down
2 changes: 2 additions & 0 deletions crates/crates_io_database/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,8 @@ diesel::table! {
semver_ord -> Nullable<Jsonb>,
/// JSONB data containing JWT claims from the trusted publisher (e.g., GitHub Actions context like repository, run_id, sha)
trustpub_data -> Nullable<Jsonb>,
/// Source Lines of Code statistics for this version, stored as JSON with language breakdown and totals.
linecounts -> Nullable<Jsonb>,
}
}

Expand Down
2 changes: 2 additions & 0 deletions crates/crates_io_database_dump/src/dump-db.toml
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,8 @@ categories = "public"
keywords = "public"
# The following column is private for now, until we can guarantee a stable data schema.
trustpub_data = "private"
# The following column is private for now, until we can guarantee a stable data schema.
linecounts = "private"

[versions_published_by.columns]
version_id = "private"
Expand Down
17 changes: 17 additions & 0 deletions crates/crates_io_linecount/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[package]
name = "crates_io_linecount"
version = "0.0.0"
description = "Lines of code counting for crates.io using tokei"
license = "MIT OR Apache-2.0"
edition = "2024"

[lints]
workspace = true

[dependencies]
serde = { version = "=1.0.223", features = ["derive"] }
tokei = "=13.0.0-alpha.9"

[dev-dependencies]
claims = "=0.8.0"
insta = { version = "=1.43.2", features = ["json"] }
62 changes: 62 additions & 0 deletions crates/crates_io_linecount/src/languages.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
use tokei::LanguageType;

/// Report whether `lang` is excluded from line counting.
///
/// Returns `true` for the configuration/data formats, documentation formats,
/// build-system files, Batch/PowerShell scripts and other non-programming
/// formats listed in the body, and `false` for every other language.
pub fn should_ignore_language(lang: LanguageType) -> bool {
    // Every language that is left out of the statistics, grouped by the
    // reason it is not considered "real" source code.
    const IGNORED_LANGUAGES: &[LanguageType] = &[
        // Configuration and data files
        LanguageType::Json,
        LanguageType::Yaml,
        LanguageType::Toml,
        LanguageType::Xml,
        LanguageType::Ini,
        // Documentation
        LanguageType::Markdown,
        LanguageType::Text,
        LanguageType::ReStructuredText,
        LanguageType::AsciiDoc,
        LanguageType::Org,
        // Build system files
        LanguageType::Makefile,
        LanguageType::CMake,
        LanguageType::Dockerfile,
        LanguageType::Autoconf,
        LanguageType::MsBuild,
        LanguageType::Meson,
        LanguageType::Scons,
        LanguageType::Bazel,
        LanguageType::Nix,
        // Batch and PowerShell scripts (debatable, but often just
        // build/deploy automation)
        LanguageType::Batch,
        LanguageType::PowerShell,
        // Other non-programming files
        LanguageType::Svg,
        LanguageType::Hex,
        LanguageType::Protobuf,
        LanguageType::Thrift,
    ];

    IGNORED_LANGUAGES.contains(&lang)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_should_ignore_language() {
        // Programming and markup languages must be counted.
        let counted = [
            LanguageType::Rust,
            LanguageType::JavaScript,
            LanguageType::Html,
            LanguageType::Css,
        ];
        for lang in counted {
            assert!(!should_ignore_language(lang), "{lang:?} should be counted");
        }

        // Config, data and documentation formats must be skipped.
        let skipped = [
            LanguageType::Json,
            LanguageType::Yaml,
            LanguageType::Toml,
            LanguageType::Markdown,
        ];
        for lang in skipped {
            assert!(should_ignore_language(lang), "{lang:?} should be ignored");
        }
    }
}
143 changes: 143 additions & 0 deletions crates/crates_io_linecount/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
mod languages;
mod paths;

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::LazyLock;
use tokei::Config;

pub use crate::paths::PathDetails;

// Re-export LanguageType for use by other crates
pub use tokei::LanguageType;

/// Tokei configuration shared by every call to `LinecountStats::add_file`.
///
/// The value is built lazily on first access through `LazyLock` and reused
/// afterwards. `no_ignore` and `treat_doc_strings_as_comments` are switched
/// on; every other option keeps tokei's default.
static TOKEI_CONFIG: LazyLock<Config> = LazyLock::new(|| Config {
// NOTE(review): presumably turns off ignore-file handling, which should be
// irrelevant when parsing in-memory slices — confirm against the tokei docs.
no_ignore: Some(true),
// NOTE(review): presumably makes doc strings count as comments instead of
// code — confirm against the tokei docs.
treat_doc_strings_as_comments: Some(true),
..Default::default()
});

/// Statistics for a single programming language
///
/// `LinecountStats::add_file` adds to these counters once for every file of
/// the language it processes. The `Default` value has all counters at zero.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct LanguageStats {
/// Number of lines of code (excluding comments and blank lines), summed
/// over all files of this language
pub code_lines: usize,
/// Number of comment lines, summed over all files of this language
pub comment_lines: usize,
/// Number of files of this language that were added
pub files: usize,
}

/// Complete line count statistics for a crate
///
/// Starts out empty (see `LinecountStats::new`) and is filled one file at a
/// time through `LinecountStats::add_file`. The type is serializable, and the
/// tests in this file snapshot it as JSON.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct LinecountStats {
/// Per-language breakdown of line counts, keyed by the tokei language type
pub languages: HashMap<LanguageType, LanguageStats>,
/// Total lines of code across all languages; `add_file` increments it by
/// the same amount it adds to the per-language `code_lines`
pub total_code_lines: usize,
/// Total comment lines across all languages; `add_file` increments it by
/// the same amount it adds to the per-language `comment_lines`
pub total_comment_lines: usize,
}

impl LinecountStats {
    /// Create a new empty statistics collection (no languages, zero totals).
    pub fn new() -> Self {
        Self::default()
    }

    /// Parse one file with tokei and fold its counts into the statistics.
    ///
    /// `content` is analysed as source code of `language_type` using the
    /// shared `TOKEI_CONFIG`. The resulting code and comment line counts are
    /// added both to the entry for `language_type` (created on first use) and
    /// to the crate-wide totals; the per-language file counter goes up by one.
    ///
    /// This method performs no filtering of its own. Callers that want to
    /// skip tests, examples, benchmarks, hidden files or ignored languages
    /// should build a [`PathDetails`] with `PathDetails::from_path` and check
    /// `should_ignore()` and `language_type()` first. Doing so before
    /// decompressing a file also avoids unnecessary work.
    pub fn add_file(&mut self, language_type: LanguageType, content: &[u8]) {
        let file_stats = language_type.parse_from_slice(content, &TOKEI_CONFIG);
        let (code, comments) = (file_stats.code, file_stats.comments);

        // Per-language breakdown.
        let per_language = self.languages.entry(language_type).or_default();
        per_language.code_lines += code;
        per_language.comment_lines += comments;
        per_language.files += 1;

        // Crate-wide totals.
        self.total_code_lines += code;
        self.total_comment_lines += comments;
    }
}

#[cfg(test)]
mod tests {
use super::*;
use std::path::Path;

// A freshly created collection serializes with no languages and zero totals.
#[test]
fn test_empty() {
let stats = LinecountStats::new();
insta::assert_json_snapshot!(stats, @r#"
{
"languages": {},
"total_code_lines": 0,
"total_comment_lines": 0
}
"#);
}

// Adding one Rust file updates both the per-language entry and the totals.
#[test]
fn test_add_file() {
let mut stats = LinecountStats::new();

// Add a Rust file: one comment line followed by three lines of code
let rust_code = b"// This is a comment\nfn main() {\n println!(\"Hello\");\n}";
stats.add_file(LanguageType::Rust, rust_code);

insta::assert_json_snapshot!(stats, @r#"
{
"languages": {
"Rust": {
"code_lines": 3,
"comment_lines": 1,
"files": 1
}
},
"total_code_lines": 3,
"total_comment_lines": 1
}
"#);
}

// End-to-end flow a caller is expected to follow: classify each path with
// `PathDetails`, then only feed the files that pass both filters to `add_file`.
#[test]
fn test_workflow() {
let mut stats = LinecountStats::new();

let files = [
("src/lib.rs", "pub fn hello() {}"),
("tests/test.rs", "fn test() {}"), // Should be skipped
("README.md", "# Hello"), // Should be skipped
];

for (path, content) in files {
let path = Path::new(path);
let path_details = PathDetails::from_path(path);

// `should_ignore()` rejects by location/name, `language_type()` returns
// `None` for unknown or ignored languages.
if !path_details.should_ignore()
&& let Some(language_type) = path_details.language_type()
{
stats.add_file(language_type, content.as_bytes())
};
}

// Only `src/lib.rs` is expected to have been counted.
insta::assert_json_snapshot!(stats, @r#"
{
"languages": {
"Rust": {
"code_lines": 1,
"comment_lines": 0,
"files": 1
}
},
"total_code_lines": 1,
"total_comment_lines": 0
}
"#);
}
}
74 changes: 74 additions & 0 deletions crates/crates_io_linecount/src/paths.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use crate::languages::should_ignore_language;
use std::path::Path;
use tokei::LanguageType;

/// Classification of a single file path for line counting purposes.
///
/// Built with `PathDetails::from_path`, which only inspects the path itself.
/// The flags feed `should_ignore()`, and the detected language is exposed
/// through `language_type()` and `actual_language_type()`.
#[derive(Debug, Clone, Copy)]
pub struct PathDetails {
/// Whether `from_path` classified the path as benchmark code
is_benchmark: bool,
/// Whether `from_path` classified the path as example code
is_example: bool,
/// Whether the file name starts with a `.`
is_hidden: bool,
/// Whether `from_path` classified the path as test code
is_test: bool,
/// Language detected from the file extension, or `None` if the path has no
/// extension or tokei does not recognize it
language_type: Option<LanguageType>,
}

impl PathDetails {
    /// Directory names that mark benchmark code.
    const BENCHMARK_DIRS: &'static [&'static str] = &["benches", "benchmark"];
    /// Directory names that mark example code.
    const EXAMPLE_DIRS: &'static [&'static str] = &["examples"];
    /// Directory names that mark test code.
    const TEST_DIRS: &'static [&'static str] = &["tests", "test", "testing"];

    /// Classify `path` by looking only at the path itself (no file I/O).
    ///
    /// * Benchmark / example / test detection inspects the *directory*
    ///   components of the path, case-insensitively. A directory matches when
    ///   its name equals one of the marker names or ends with `_<marker>` or
    ///   `-<marker>` (for example `tests`, `integration_tests`, `unit-test`).
    ///   Matching whole components instead of searching the raw path string
    ///   avoids false positives such as `src/latest/mod.rs` or
    ///   `src/contest/lib.rs`, which merely *contain* `test/`.
    /// * A file is hidden when its file name starts with `.`.
    /// * The language is looked up from the file extension via
    ///   `LanguageType::from_file_extension`; it is `None` when there is no
    ///   (UTF-8) extension or the extension is unknown.
    pub fn from_path(path: &Path) -> Self {
        // Lower-cased names of the directories leading to the file. The final
        // component (the file name) is deliberately left out so that a file
        // called e.g. `test` is not mistaken for a test directory.
        let dir_names: Vec<String> = path
            .parent()
            .into_iter()
            .flat_map(|dir| dir.components())
            .map(|component| component.as_os_str().to_string_lossy().to_lowercase())
            .collect();

        let is_benchmark = Self::has_dir_named(&dir_names, Self::BENCHMARK_DIRS);
        let is_example = Self::has_dir_named(&dir_names, Self::EXAMPLE_DIRS);
        let is_test = Self::has_dir_named(&dir_names, Self::TEST_DIRS);

        let is_hidden = path
            .file_name()
            .is_some_and(|filename| filename.to_string_lossy().starts_with('.'));

        let language_type = path
            .extension()
            .and_then(|ext| ext.to_str())
            .and_then(LanguageType::from_file_extension);

        Self {
            is_benchmark,
            is_example,
            is_hidden,
            is_test,
            language_type,
        }
    }

    /// Return `true` if any of `dir_names` equals one of `markers` or ends
    /// with a marker preceded by `_` or `-`.
    fn has_dir_named(dir_names: &[String], markers: &[&str]) -> bool {
        dir_names.iter().any(|dir| {
            markers.iter().any(|marker| {
                dir.strip_suffix(*marker).is_some_and(|prefix| {
                    // Empty prefix: exact match. Otherwise require a word
                    // separator so that `latest` does not match `test`.
                    matches!(prefix.chars().next_back(), None | Some('_' | '-'))
                })
            })
        })
    }

    /// Determine if the file should be ignored for line counting purposes
    /// because it is a benchmark, example, hidden, or test file.
    ///
    /// The detected language plays no role here; combine this with
    /// `language_type()` to decide whether a file is counted.
    pub fn should_ignore(&self) -> bool {
        self.is_benchmark || self.is_example || self.is_hidden || self.is_test
    }

    /// Get the actual detected language type, even if it should be ignored.
    pub fn actual_language_type(&self) -> Option<LanguageType> {
        self.language_type
    }

    /// Get the detected language type, returning `None` if no language was
    /// detected or if the language should be ignored (e.g., data files)
    /// according to `should_ignore_language`.
    pub fn language_type(&self) -> Option<LanguageType> {
        self.language_type.filter(|lt| !should_ignore_language(*lt))
    }
}

#[cfg(test)]
mod tests {
use super::*;
use insta::assert_debug_snapshot;

// Snapshot the full `PathDetails` produced by `PathDetails::from_path` for a
// nested test directory, a top-level test, an example, a benchmark and a
// regular source file. (The test name mentions `should_count_path`, but the
// function exercised here is `from_path`.)
//
// NOTE(review): the snapshots are unnamed and stored in separate files;
// presumably insta derives their names from this function's name and the
// order of the calls, so renaming the test or reordering the assertions
// would orphan the stored snapshots — confirm before changing either.
#[test]
fn test_should_count_path() {
assert_debug_snapshot!(PathDetails::from_path(Path::new("src/tests/mod.rs")));
assert_debug_snapshot!(PathDetails::from_path(Path::new("tests/integration.rs")));
assert_debug_snapshot!(PathDetails::from_path(Path::new("examples/basic.rs")));
assert_debug_snapshot!(PathDetails::from_path(Path::new("benches/bench.rs")));
assert_debug_snapshot!(PathDetails::from_path(Path::new("src/lib.rs")));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
source: crates/crates_io_linecount/src/paths.rs
expression: "PathDetails::from_path(Path::new(\"tests/integration.rs\"))"
---
PathDetails {
is_benchmark: false,
is_example: false,
is_hidden: false,
is_test: true,
language_type: Some(
Rust,
),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
source: crates/crates_io_linecount/src/paths.rs
expression: "PathDetails::from_path(Path::new(\"examples/basic.rs\"))"
---
PathDetails {
is_benchmark: false,
is_example: true,
is_hidden: false,
is_test: false,
language_type: Some(
Rust,
),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
source: crates/crates_io_linecount/src/paths.rs
expression: "PathDetails::from_path(Path::new(\"benches/bench.rs\"))"
---
PathDetails {
is_benchmark: true,
is_example: false,
is_hidden: false,
is_test: false,
language_type: Some(
Rust,
),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
source: crates/crates_io_linecount/src/paths.rs
expression: "PathDetails::from_path(Path::new(\"src/lib.rs\"))"
---
PathDetails {
is_benchmark: false,
is_example: false,
is_hidden: false,
is_test: false,
language_type: Some(
Rust,
),
}
Loading