diff --git a/DocsScraper/Project.toml b/DocsScraper/Project.toml
new file mode 100644
index 0000000..7112c7b
--- /dev/null
+++ b/DocsScraper/Project.toml
@@ -0,0 +1,16 @@
+name = "DocsScraper"
+uuid = "6a596b1c-bad6-44d4-a29c-1a7b4368ba96"
+authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"]
+version = "0.1.0"
+
+[deps]
+AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
+
+[compat]
+AbstractTrees = "0.4.5"
+Gumbo = "0.8.2"
+HTTP = "1.10.4"
+URIs = "1.5.1"
diff --git a/DocsScraper/README.md b/DocsScraper/README.md
new file mode 100644
index 0000000..9393941
--- /dev/null
+++ b/DocsScraper/README.md
@@ -0,0 +1,24 @@
+# DocsScraper: A document scraping and parsing tool used to create a custom RAG database for AIHelpMe.jl
+
+This tool collects and parses Julia's extensive documentation.
+
+## Requirements
+
+```julia
+using Pkg
+Pkg.add("HTTP")
+Pkg.add("Gumbo")
+Pkg.add("AbstractTrees")
+Pkg.add("URIs")
+```
+
+## Usage
+1. **Basic Usage**:
+```julia
+parsed_blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])
+```
+
+## How it works
+`parse_url_to_blocks(urls::Vector{<:AbstractString})` extracts the base URL of each page and recursively parses it, so that all nested text and code is returned as a Vector of Dicts along with each text/code block's metadata.
+
+Please note that this is merely a pre-release and more work needs to be done.
\ No newline at end of file
diff --git a/DocsScraper/src/DocsScraper.jl b/DocsScraper/src/DocsScraper.jl
new file mode 100644
index 0000000..36a1254
--- /dev/null
+++ b/DocsScraper/src/DocsScraper.jl
@@ -0,0 +1 @@
+include("parser.jl")
diff --git a/DocsScraper/src/parser.jl b/DocsScraper/src/parser.jl
new file mode 100644
index 0000000..d1fbb49
--- /dev/null
+++ b/DocsScraper/src/parser.jl
@@ -0,0 +1,429 @@
+using AbstractTrees
+using Gumbo
+using HTTP
+using URIs
+
+"""
+Working:
+
+Since the HTML structure is complex, we need to decide when to insert the extracted text into parsed_blocks:
+should we accumulate the text of the child hierarchy first and then insert, or insert now and let the child hierarchy make another insertion?
+For this we employ multiple checks. If the current node is a heading, insert it directly into parsed_blocks.
+If the current node is a code block, return the text inside the code block wrapped in backticks.
+If the node is neither a heading nor code, we go deeper into the hierarchy.
+If the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td],
+it is assumed that everything inside the tag is part of a single text block with inline code.
+But when we go deeper and encounter a standalone (non-inline) code block, that assumption turns out to be false.
+To correct this, we first insert the previously extracted text, then insert the current code, and additionally indicate to the parent recursion
+that the current iteration has already inserted the previously parsed text, so there is no need for the parent iteration to insert the text block again.
+We indicate this by the return flag is_text_inserted.
+"""
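To make the target shape concrete before the individual handlers: each entry pushed into `parsed_blocks` is a `Dict` with one content key (`"heading"`, `"text"`, `"code"`, or `"docstring_header"`) plus a `"metadata"` snapshot of the heading hierarchy at the time of insertion. A rough sketch with made-up values (the key layout follows `insert_parsed_data!` below):

```julia
parsed_blocks = [
    Dict{String,Any}("heading" => "Some heading",
        "metadata" => Dict{Symbol,Any}(:h1 => "Some heading")),
    Dict{String,Any}("text" => "Some paragraph text with `inline code`",
        "metadata" => Dict{Symbol,Any}(:h1 => "Some heading")),
    Dict{String,Any}("code" => "```julia f(x) = x + 1```",
        "metadata" => Dict{Symbol,Any}(:h1 => "Some heading")),
]
```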
+
+"""
+    insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        text_to_insert::AbstractString,
+        text_type::AbstractString)
+
+Insert the text into the parsed_blocks Vector.
+
+# Arguments
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- text_to_insert: Text to be inserted
+- text_type: The text to be inserted could be a heading, a code block, or plain text
+"""
+function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    text_to_insert::AbstractString,
+    text_type::AbstractString)
+
+    if !isempty(strip(text_to_insert))
+        push!(parsed_blocks,
+            Dict(text_type => strip(text_to_insert),
+                "metadata" => copy(heading_hierarchy)))
+    end
+end
+
+
+"""
+    process_headings!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}})
+
+Process headings. If the current node is a heading, insert it directly into parsed_blocks.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+"""
+function process_headings!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}})
+
+    tag_name = Gumbo.tag(node)
+    # Clear headings of equal or lower level; :docstring_header is not part of the h1-h6 hierarchy
+    for k in collect(keys(heading_hierarchy))
+        if k != :docstring_header && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
+            delete!(heading_hierarchy, k)
+        end
+    end
+    heading_hierarchy[tag_name] = strip(Gumbo.text(node))
+    insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(node), "heading")
+
+    is_code_block = false
+    is_text_inserted = false
+    return "", is_code_block, is_text_inserted
+end
+
+"""
+    process_code(node::Gumbo.HTMLElement)
+
+Process code snippets. If the current node is a code block, return the text inside the code block wrapped in backticks.
+
+# Arguments
+- node: The root HTML node
+"""
+function process_code(node::Gumbo.HTMLElement)
+    is_code_block = false
+
+    # Start a new code block
+    if Gumbo.tag(node.parent) == :pre
+        class_name = getattr(node, "class", "")
+        if occursin("language", class_name)
+            match_result = match(r"language-(\S+)", class_name)
+            language = match_result !== nothing ? match_result.captures[1] : "julia"
+            code_content = "```$language " * strip(Gumbo.text(node)) * "```"
+        else
+            code_content = "```julia " * strip(Gumbo.text(node)) * "```"
+        end
+        is_code_block = true
+    else
+        code_content = "`" * strip(Gumbo.text(node)) * "`"
+    end
+    is_text_inserted = false
+    return code_content, is_code_block, is_text_inserted
+end
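As a rough illustration of `process_code` on hand-built snippets (assuming Gumbo is loaded; the HTML strings and variable names are made up for the example):

```julia
using Gumbo

# A fenced block: the <code> sits under <pre> and carries a language class.
doc = parsehtml("<pre><code class=\"language-python\">print(1)</code></pre>")
code_node = doc.root.children[2].children[1].children[1]   # html > body > pre > code
process_code(code_node)    # ("```python print(1)```", true, false)

# The same tag outside <pre> is treated as inline code instead.
doc2 = parsehtml("<p>see <code>push!</code></p>")
inline_node = doc2.root.children[2].children[1].children[2]   # html > body > p > code
process_code(inline_node)  # ("`push!`", false, false)
```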
+
+"""
+    process_generic_node!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        child_new::Bool=true,
+        prev_text_buffer::IO=IOBuffer(write=true))
+
+Process nodes that are neither headings nor code blocks: recursively process the children and group inline content into a single text block.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- child_new: Bool to specify whether the current block (child) is part of the previous block or not.
+    If it's not, then a new insertion needs to be created in parsed_blocks
+- prev_text_buffer: IO buffer which contains the previous text
+"""
+function process_generic_node!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    child_new::Bool=true,
+    prev_text_buffer::IO=IOBuffer(write=true))
+
+    seekstart(prev_text_buffer)
+    prev_text = read(prev_text_buffer, String)
+
+    tag_name = Gumbo.tag(node)
+    text_to_insert = ""
+    # Recursively process the child nodes for text content
+    children = collect(AbstractTrees.children(node))
+    num_children = length(children)
+    is_code_block = false
+    is_text_inserted = false
+    for (index, child) in enumerate(children)
+        # if the current tag belongs to the list, it is assumed that all the text/code should be part of a single paragraph/block, unless
+        # a standalone code block occurs, in which case the previously parsed text is inserted first and then the code block is inserted.
+        if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header]
+            received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+        else
+            received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+        end
+
+        # reset text_to_insert to "" to avoid inserting it again (it was already inserted by the child recursion call)
+        if is_text_inserted
+            text_to_insert = ""
+            prev_text = ""
+            take!(prev_text_buffer)
+        end
+
+        # if is_code_block is true, received_text is a code block and needs to be put as a separate entry in parsed_blocks
+        if !isempty(strip(received_text)) && is_code_block == true
+            to_insert = String(take!(prev_text_buffer))
+            if (!isempty(strip(to_insert)))
+                insert_parsed_data!(heading_hierarchy, parsed_blocks, to_insert, "text")
+                text_to_insert = ""
+                prev_text = ""
+                is_text_inserted = true
+            end
+            insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "code")
+            is_code_block = false
+            received_text = ""
+        end
+
+        # if the code block is the last part of the loop then, due to is_code_block::Bool, the whole text_to_insert would become code
+        # (code is returned with backticks; if the code is not inline and is supposed to be a separate block,
+        # that case is handled earlier, when the code's parent is <pre>)
+        if index == num_children
+            is_code_block = false
+        end
+
+        if !isempty(strip(received_text))
+            print(prev_text_buffer, " " * received_text)
+            text_to_insert = text_to_insert * " " * received_text
+        end
+
+    end
+
+    # if child_new is false, a new child (new entry in parsed_blocks) should not be created, hence
+    # reset the buffer and return the text
+    if (child_new == false)
+        take!(prev_text_buffer)
+        print(prev_text_buffer, prev_text)
+        return text_to_insert, is_code_block, is_text_inserted
+    end
+
+    # insert text_to_insert into parsed_blocks
+    # if we're inserting text at the current node level, then we should insert the previous text if available,
+    # otherwise it would be inserted when control goes back to the parent call and the order of insertion would be wrong
+    if !isempty(strip(text_to_insert))
+        insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
+        is_text_inserted = true
+    end
+
+    # the current recursive call appended to prev_text_buffer, which we need to undo
+    take!(prev_text_buffer)
+    print(prev_text_buffer, prev_text)
+    return "", is_code_block, is_text_inserted
+end
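A small sketch of how this grouping plays out for a paragraph with inline code (hypothetical snippet; assumes the dependencies above are loaded):

```julia
using Gumbo

doc = parsehtml("<p>Note the <code>push!</code> function</p>")
p_node = doc.root.children[2].children[1]   # html > body > p

parsed_blocks = Vector{Dict{String,Any}}()
heading_hierarchy = Dict{Symbol,Any}()
process_node!(p_node, heading_hierarchy, parsed_blocks)

# The paragraph and its inline code end up as a single text entry:
# parsed_blocks[1] == Dict("text" => "Note the `push!` function",
#                          "metadata" => Dict{Symbol,Any}())
```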
+
+"""
+    process_docstring!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        child_new::Bool=true,
+        prev_text_buffer::IO=IOBuffer(write=true))
+
+Process a node of class `docstring`.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- child_new: Bool to specify whether the current block (child) is part of the previous block or not.
+    If it's not, then a new insertion needs to be created in parsed_blocks
+- prev_text_buffer: IO buffer which contains the previous text
+"""
+function process_docstring!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    child_new::Bool=true,
+    prev_text_buffer::IO=IOBuffer(write=true))
+
+    seekstart(prev_text_buffer)
+    prev_text = read(prev_text_buffer, String)
+    is_code_block = false
+    is_text_inserted = false
+
+    # Recursively process the child nodes for text content
+    children = collect(AbstractTrees.children(node))
+
+    # Insert previously collected text
+    to_insert = String(take!(prev_text_buffer))
+    if (!isempty(strip(to_insert)))
+        insert_parsed_data!(heading_hierarchy, parsed_blocks, to_insert, "text")
+        prev_text = ""
+        is_text_inserted = true
+    end
+
+    # Insert "header"
+    if Gumbo.tag(children[1]) == :header
+        heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1]))
+        insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
+    end
+
+    received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+
+    if !isempty(strip(received_text))
+        insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text")
+    end
+    delete!(heading_hierarchy, :docstring_header)
+
+    return "", is_code_block, is_text_inserted
+end
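For example, on a hand-built Documenter-style `article.docstring` node (whitespace between tags deliberately omitted so that `children[1]`/`children[2]` line up the way this function expects):

```julia
using Gumbo

html = "<article class=\"docstring\"><header>Base.push!</header>" *
       "<section><p>Insert items at the end of a collection.</p></section></article>"
article = parsehtml(html).root.children[2].children[1]   # html > body > article

parsed_blocks = Vector{Dict{String,Any}}()
heading_hierarchy = Dict{Symbol,Any}()
process_node!(article, heading_hierarchy, parsed_blocks)

# parsed_blocks[1] holds the docstring header:
#   Dict("docstring_header" => "Base.push!", "metadata" => Dict(:docstring_header => "Base.push!"))
# parsed_blocks[2] holds the body text with the same metadata:
#   Dict("text" => "Insert items at the end of a collection.", "metadata" => ...)
```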
+
+"""
+    process_node!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        child_new::Bool=true,
+        prev_text_buffer::IO=IOBuffer(write=true))
+
+Process a node by dispatching to the heading, code, docstring, or generic handler.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- child_new: Bool to specify whether the current block (child) is part of the previous block or not.
+    If it's not, then a new insertion needs to be created in parsed_blocks
+- prev_text_buffer: IO buffer which contains the previous text
+"""
+function process_node!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    child_new::Bool=true,
+    prev_text_buffer::IO=IOBuffer(write=true))
+
+    tag_name = Gumbo.tag(node)
+    if startswith(string(tag_name), "h") && isdigit(last(string(tag_name)))
+        return process_headings!(node, heading_hierarchy, parsed_blocks)
+
+    elseif tag_name == :code
+        return process_code(node)
+
+    elseif tag_name == :article && getattr(node, "class", "") == "docstring"
+        return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+
+    end
+
+    return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+end
+
+
+"""
+Method of process_node! for nodes of type Gumbo.HTMLText: returns the stripped text.
+"""
+function process_node!(node::Gumbo.HTMLText, args...)
+    is_code_block = false
+    is_text_inserted = false
+    return strip(Gumbo.text(node)), is_code_block, is_text_inserted
+end
+
+
+"""
+    get_base_url(url::AbstractString)
+
+Extract the base URL (scheme, host, optional port, and path), dropping any query string or fragment.
+
+# Arguments
+- `url`: The URL string from which the base URL is extracted
+"""
+function get_base_url(url::AbstractString)
+    parsed_url = URIs.URI(url)
+    # URIs.URI stores the port as a (possibly empty) string, so check for emptiness rather than `nothing`
+    base_url = string(parsed_url.scheme, "://", parsed_url.host,
+        isempty(parsed_url.port) ? "" : ":" * parsed_url.port, parsed_url.path)
+    return base_url
+end
+
+"""
+    get_html_content(root::Gumbo.HTMLElement)
+
+Returns the main content of the HTML. If no known content container is found, returns the whole HTML root to parse.
+
+# Arguments
+- `root`: The HTML root from which content is extracted
+"""
+function get_html_content(root::Gumbo.HTMLElement)
+    target_ids = Set(["VPContent", "main_content_wrap", "pages-content"])
+    target_classes = Set(["content", "franklin-content"])
+
+    content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
+
+    # First try to find by ID
+    content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates)
+    if !isempty(content_by_id)
+        return only(content_by_id)
+    end
+
+    # Fall back to class if no ID matches
+    content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates)
+    if !isempty(content_by_class)
+        return only(content_by_class)
+    end
+
+    # Fall back to the root node if no class matches
+    return root
+end
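For instance (a sketch, assuming URIs is available and parser.jl has been included; query strings and fragments are dropped while an explicit port is kept):

```julia
get_base_url("https://docs.julialang.org/en/v1/base/multi-threading/#man-multithreading")
# "https://docs.julialang.org/en/v1/base/multi-threading/"

get_base_url("http://localhost:8000/docs/index.html?version=1")
# "http://localhost:8000/docs/index.html"
```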
+
+"""
+    parse_url_to_blocks(urls::Vector{<:AbstractString})
+
+Main entry point: fetch and parse the HTML of each URL.
+
+# Arguments
+- `urls`: vector containing URL strings to parse
+
+# Returns
+- A Vector of Dicts containing Heading/Text/Code along with a Dict of respective metadata
+
+# Usage
+parsed_blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])
+
+# Example
+Let the HTML be:
+<html>
+<body>
+<h1>Heading 1</h1>
+<h2>Heading 2</h2>
+<p>para 1</p>
+<h3>Heading 3</h3>
+<pre><code>this is my code block</code></pre>
+<h3>This is another h3 under Heading 2</h3>
+<p>This is a paragraph with inline code</p>
+<h2>Heading 2_2</h2>
+<p>para ewg</p>
+</body>
+</html>
+
+Output:
+Any[
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
+    Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
+]
+"""
+function parse_url_to_blocks(urls::Vector{<:AbstractString})
+
+    ## TODO: Check if you need parallel processing for multiple urls
+
+    parsed_blocks = Vector{Dict{String,Any}}()
+    heading_hierarchy = Dict{Symbol,Any}()
+
+    for url in urls
+        @info "Parsing URL: $url"
+        base_url = get_base_url(url)
+        r = HTTP.get(base_url)
+        r_parsed = parsehtml(String(r.body))
+        # Getting the title of the document
+        # title = [el
+        #     for el in AbstractTrees.PreOrderDFS(r_parsed.root)
+        #     if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
+
+        process_node!(get_html_content(r_parsed.root), heading_hierarchy, parsed_blocks)
+    end
+    return parsed_blocks
+end
diff --git a/DocsScraper/test/runtests.jl b/DocsScraper/test/runtests.jl
new file mode 100644
index 0000000..a32528c
--- /dev/null
+++ b/DocsScraper/test/runtests.jl
@@ -0,0 +1,7 @@
+using Test
+
+include(joinpath("..", "src", "DocsScraper.jl"))
+
+@testset "DocsScraper Tests" begin
+    # Your test cases go here
+end
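For completeness, an end-to-end run of the pieces above could look like the following sketch (requires network access; the include path and the number of printed entries are illustrative):

```julia
# Run from within the DocsScraper/ directory so the include path resolves.
include(joinpath("src", "DocsScraper.jl"))

blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])

# Each entry carries one content key ("heading", "text", "code", or
# "docstring_header") plus the "metadata" heading hierarchy.
for block in first(blocks, 5)
    content_key = only(setdiff(keys(block), ["metadata"]))
    println(content_key, ": ", first(block[content_key], 60))
end
```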