From 8d2c661877a34b98208d0bc81311b7ad13c6a425 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sat, 11 Jan 2025 07:35:25 -0600 Subject: [PATCH 01/13] broadcast before joining --- .../bipartite/grouping.py | 5 +-- .../monopartite/batching.py | 4 +-- .../monopartite/grouping.py | 35 ++++++++++++------- .../predefined_components/grouping.py | 3 +- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/neo4j_parallel_spark_loader/bipartite/grouping.py b/neo4j_parallel_spark_loader/bipartite/grouping.py index 72b3141..9dfd296 100644 --- a/neo4j_parallel_spark_loader/bipartite/grouping.py +++ b/neo4j_parallel_spark_loader/bipartite/grouping.py @@ -1,4 +1,5 @@ from pyspark.sql import DataFrame +from pyspark.sql.functions import broadcast from ..utils.grouping import ( create_group_column_from_source_and_target_groups, @@ -60,12 +61,12 @@ def create_node_groupings( ) final_sdf = spark_dataframe.join( - other=source_groupings_sdf.withColumnRenamed("group", "source_group"), + other=broadcast(source_groupings_sdf.withColumnRenamed("group", "source_group")), on=(spark_dataframe[source_col] == source_groupings_sdf.value), how="left", ).drop(source_groupings_sdf.value) final_sdf = final_sdf.join( - other=target_groupings_sdf.withColumnRenamed("group", "target_group"), + other=broadcast(target_groupings_sdf.withColumnRenamed("group", "target_group")), on=(spark_dataframe[target_col] == target_groupings_sdf.value), how="left", ).drop(target_groupings_sdf.value) diff --git a/neo4j_parallel_spark_loader/monopartite/batching.py b/neo4j_parallel_spark_loader/monopartite/batching.py index ebb683d..955ac8e 100644 --- a/neo4j_parallel_spark_loader/monopartite/batching.py +++ b/neo4j_parallel_spark_loader/monopartite/batching.py @@ -1,7 +1,7 @@ from typing import Dict, Tuple from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.functions import col +from pyspark.sql.functions import broadcast def create_ingest_batches_from_groups(spark_dataframe: DataFrame) -> 
DataFrame: @@ -39,7 +39,7 @@ def create_ingest_batches_from_groups(spark_dataframe: DataFrame) -> DataFrame: # Join the DataFrames result_df = spark_dataframe.join( - other=coloring_df, + other=broadcast(coloring_df), on=(spark_dataframe.group == coloring_df.group), how="left", # Use left join to keep all records from spark_dataframe ).drop( diff --git a/neo4j_parallel_spark_loader/monopartite/grouping.py b/neo4j_parallel_spark_loader/monopartite/grouping.py index 59f44b2..5a26c5e 100644 --- a/neo4j_parallel_spark_loader/monopartite/grouping.py +++ b/neo4j_parallel_spark_loader/monopartite/grouping.py @@ -1,5 +1,5 @@ from pyspark.sql import DataFrame -from pyspark.sql.functions import col, concat, greatest, least, lit +from pyspark.sql.functions import broadcast, concat, greatest, least, lit from ..utils.grouping import ( create_value_groupings, @@ -50,18 +50,27 @@ def create_node_groupings( grouping_column="combined_col", ) - final_sdf = spark_dataframe.join( - other=keys_sdf.withColumnRenamed("group", "source_group"), - on=(spark_dataframe[source_col] == keys_sdf.value), - how="left", - ).drop(keys_sdf.value) - final_sdf = final_sdf.join( - other=keys_sdf.withColumnRenamed("group", "target_group"), - on=(spark_dataframe[target_col] == keys_sdf.value), - how="left", - ).drop(keys_sdf.value) - - final_sdf = final_sdf.drop("value") + # Broadcast keys_sdf once + broadcasted_keys = broadcast(keys_sdf) + + # Create two views of the same broadcasted DataFrame + source_keys = broadcasted_keys.withColumnRenamed("group", "source_group") + target_keys = broadcasted_keys.withColumnRenamed("group", "target_group") + + final_sdf = (spark_dataframe + .join( + other=source_keys, + on=(spark_dataframe[source_col] == source_keys.value), + how="left" + ) + .drop(source_keys.value) + .join( + other=target_keys, + on=(spark_dataframe[target_col] == target_keys.value), + how="left" + ) + .drop(target_keys.value) + .drop("value")) final_sdf = final_sdf.withColumn( "group", diff --git 
a/neo4j_parallel_spark_loader/predefined_components/grouping.py b/neo4j_parallel_spark_loader/predefined_components/grouping.py index 020d769..e332130 100644 --- a/neo4j_parallel_spark_loader/predefined_components/grouping.py +++ b/neo4j_parallel_spark_loader/predefined_components/grouping.py @@ -1,4 +1,5 @@ from pyspark.sql import DataFrame +from pyspark.sql.functions import broadcast from ..utils.grouping import create_value_counts_dataframe, create_value_groupings from ..utils.verify_spark import verify_spark_version @@ -44,7 +45,7 @@ def create_node_groupings( ) final_sdf = spark_dataframe.join( - other=value_groupings_sdf, + other=broadcast(value_groupings_sdf), on=(spark_dataframe[partition_col] == value_groupings_sdf.value), how="left", ).drop(value_groupings_sdf.value) From 1b56c499d5144f4e5c8926f28f7a286b36508e28 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sat, 11 Jan 2025 07:53:54 -0600 Subject: [PATCH 02/13] Rely on autobroadcasting for grouping tables. --- .../bipartite/grouping.py | 5 ++--- .../monopartite/grouping.py | 21 +++++++------------ .../predefined_components/grouping.py | 3 +-- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/neo4j_parallel_spark_loader/bipartite/grouping.py b/neo4j_parallel_spark_loader/bipartite/grouping.py index 9dfd296..72b3141 100644 --- a/neo4j_parallel_spark_loader/bipartite/grouping.py +++ b/neo4j_parallel_spark_loader/bipartite/grouping.py @@ -1,5 +1,4 @@ from pyspark.sql import DataFrame -from pyspark.sql.functions import broadcast from ..utils.grouping import ( create_group_column_from_source_and_target_groups, @@ -61,12 +60,12 @@ def create_node_groupings( ) final_sdf = spark_dataframe.join( - other=broadcast(source_groupings_sdf.withColumnRenamed("group", "source_group")), + other=source_groupings_sdf.withColumnRenamed("group", "source_group"), on=(spark_dataframe[source_col] == source_groupings_sdf.value), how="left", ).drop(source_groupings_sdf.value) final_sdf = final_sdf.join( - 
other=broadcast(target_groupings_sdf.withColumnRenamed("group", "target_group")), + other=target_groupings_sdf.withColumnRenamed("group", "target_group"), on=(spark_dataframe[target_col] == target_groupings_sdf.value), how="left", ).drop(target_groupings_sdf.value) diff --git a/neo4j_parallel_spark_loader/monopartite/grouping.py b/neo4j_parallel_spark_loader/monopartite/grouping.py index 5a26c5e..fce2266 100644 --- a/neo4j_parallel_spark_loader/monopartite/grouping.py +++ b/neo4j_parallel_spark_loader/monopartite/grouping.py @@ -1,5 +1,5 @@ from pyspark.sql import DataFrame -from pyspark.sql.functions import broadcast, concat, greatest, least, lit +from pyspark.sql.functions import concat, greatest, least, lit from ..utils.grouping import ( create_value_groupings, @@ -50,26 +50,19 @@ def create_node_groupings( grouping_column="combined_col", ) - # Broadcast keys_sdf once - broadcasted_keys = broadcast(keys_sdf) - - # Create two views of the same broadcasted DataFrame - source_keys = broadcasted_keys.withColumnRenamed("group", "source_group") - target_keys = broadcasted_keys.withColumnRenamed("group", "target_group") - final_sdf = (spark_dataframe .join( - other=source_keys, - on=(spark_dataframe[source_col] == source_keys.value), + other=keys_sdf.withColumnRenamed("group", "source_group"), + on=(spark_dataframe[source_col] == keys_sdf.value), how="left" ) - .drop(source_keys.value) + .drop(keys_sdf.value) .join( - other=target_keys, - on=(spark_dataframe[target_col] == target_keys.value), + other=keys_sdf.withColumnRenamed("group", "source_group"), + on=(spark_dataframe[target_col] == keys_sdf.value), how="left" ) - .drop(target_keys.value) + .drop(keys_sdf.value) .drop("value")) final_sdf = final_sdf.withColumn( diff --git a/neo4j_parallel_spark_loader/predefined_components/grouping.py b/neo4j_parallel_spark_loader/predefined_components/grouping.py index e332130..020d769 100644 --- a/neo4j_parallel_spark_loader/predefined_components/grouping.py +++ 
b/neo4j_parallel_spark_loader/predefined_components/grouping.py @@ -1,5 +1,4 @@ from pyspark.sql import DataFrame -from pyspark.sql.functions import broadcast from ..utils.grouping import create_value_counts_dataframe, create_value_groupings from ..utils.verify_spark import verify_spark_version @@ -45,7 +44,7 @@ def create_node_groupings( ) final_sdf = spark_dataframe.join( - other=broadcast(value_groupings_sdf), + other=value_groupings_sdf, on=(spark_dataframe[partition_col] == value_groupings_sdf.value), how="left", ).drop(value_groupings_sdf.value) From ea93bca7b6ebd7c8568618c3f2a04ff5679c9988 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sat, 11 Jan 2025 07:55:13 -0600 Subject: [PATCH 03/13] Count groups by batch --- neo4j_parallel_spark_loader/utils/ingest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/neo4j_parallel_spark_loader/utils/ingest.py b/neo4j_parallel_spark_loader/utils/ingest.py index 179463a..991d27a 100644 --- a/neo4j_parallel_spark_loader/utils/ingest.py +++ b/neo4j_parallel_spark_loader/utils/ingest.py @@ -67,10 +67,9 @@ def ingest_spark_dataframe( for batch_value in batch_list ] - num_groups = spark_dataframe.select("group").distinct().count() - # write batches serially to Neo4j database for batch in batches: + num_groups = batch.select("group").distinct().count() ( batch.repartition(num_groups, "group") # define parallel groups for ingest .write.mode(save_mode) From 30471bcddde99f26ca6c80f4d49da32f6853a2d1 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sat, 11 Jan 2025 09:31:18 -0600 Subject: [PATCH 04/13] Add twitch_gamers example --- examples/twitch_gamers.ipynb | 876 +++++++++++++++++++++++++++++++++++ 1 file changed, 876 insertions(+) create mode 100644 examples/twitch_gamers.ipynb diff --git a/examples/twitch_gamers.ipynb b/examples/twitch_gamers.ipynb new file mode 100644 index 0000000..1531eb5 --- /dev/null +++ b/examples/twitch_gamers.ipynb @@ -0,0 +1,876 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:49:41.2174665Z", + "execution_start_time": "2025-01-11T14:49:40.6965287Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "20a7c7a6-49c7-4b5a-9f32-5b15f0e51019", + "queued_time": "2025-01-11T14:48:11.9992599Z", + "session_id": "18", + "session_start_time": "2025-01-11T14:48:12.0369851Z", + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 2, + "statement_ids": [ + 2 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 2, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "import requests\n", + "from io import BytesIO\n", + "from zipfile import ZipFile\n", + "from neo4j_parallel_spark_loader.monopartite import group_and_batch_spark_dataframe\n", + "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create spark session" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:49:41.5117197Z", + "execution_start_time": "2025-01-11T14:49:41.3585458Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "769d32ac-fec7-4674-b91c-561e4424b35a", + "queued_time": "2025-01-11T14:48:28.1869292Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 3, + "statement_ids": [ + 3 
+ ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 3, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "username = \"NEO4J_USER_NAME\"\n", + "password = \"NEO4J_PASSWORD\"\n", + "url = \"NEO4J_URL\"\n", + "dbname = \"NEO4J_DB\"\n", + "spark_executor_count = 5\n", + "\n", + "\n", + "spark = (\n", + " SparkSession.builder\n", + " .appName(\"TwitchGamers\")\n", + " .config(\"neo4j.url\", url)\n", + " .config(\"url\", url)\n", + " .config(\"neo4j.authentication.basic.username\", username)\n", + " .config(\"neo4j.authentication.basic.password\", password)\n", + " .config(\"neo4j.database\", dbname)\n", + " .getOrCreate()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:50:23.0615084Z", + "execution_start_time": "2025-01-11T14:49:55.5797104Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "2d565aae-42f2-4029-a551-702623320475", + "queued_time": "2025-01-11T14:49:55.4616629Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 4, + "statement_ids": [ + 4 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 4, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+------------+\n", + "|numeric_id_1|numeric_id_2|\n", + "+------------+------------+\n", + "| 98343| 141493|\n", + "| 98343| 58736|\n", + "| 
98343| 140703|\n", + "| 98343| 151401|\n", + "| 98343| 157118|\n", + "| 98343| 125430|\n", + "| 98343| 3635|\n", + "| 98343| 495|\n", + "| 98343| 116648|\n", + "| 98343| 1679|\n", + "| 98343| 123861|\n", + "| 98343| 89631|\n", + "| 98343| 113417|\n", + "| 98343| 145281|\n", + "| 98343| 10408|\n", + "| 98343| 3181|\n", + "| 98343| 40675|\n", + "| 98343| 95914|\n", + "| 98343| 155127|\n", + "| 98343| 124827|\n", + "+------------+------------+\n", + "only showing top 20 rows\n", + "\n", + "root\n", + " |-- numeric_id_1: string (nullable = true)\n", + " |-- numeric_id_2: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "# Download the ZIP file\n", + "response = requests.get(\"https://snap.stanford.edu/data/twitch_gamers.zip\")\n", + "zip_file = ZipFile(BytesIO(response.content))\n", + "\n", + "# Read the CSV file directly from the ZIP\n", + "with zip_file.open(\"large_twitch_edges.csv\") as file:\n", + " # Convert to string buffer for Spark to read\n", + " content = file.read().decode('utf-8')\n", + " \n", + " # Create RDD from content\n", + " rdd = spark.sparkContext.parallelize(content.splitlines())\n", + " \n", + " # Convert RDD to DataFrame\n", + " twitch_df = spark.read.csv(rdd, header=True)\n", + "\n", + "# Now df is your Spark DataFrame containing the data\n", + "# You can verify the data\n", + "twitch_df.show()\n", + "twitch_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:50:40.3463803Z", + "execution_start_time": "2025-01-11T14:50:31.2930721Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "ec3dba3e-335f-4a2b-8f13-7f034153d961", + "queued_time": "2025-01-11T14:50:31.1556519Z", + "session_id": "18", + 
"session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 6, + "statement_ids": [ + 6 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 6, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "6797557" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "twitch_df.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:51:48.9139321Z", + "execution_start_time": "2025-01-11T14:51:42.0394455Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "5886e1ba-297a-42b5-8d17-7d141e9a261f", + "queued_time": "2025-01-11T14:51:41.8997493Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 9, + "statement_ids": [ + 9 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 9, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "168114" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "node_df = (twitch_df\n", + " .select('numeric_id_1')\n", + " .withColumnRenamed('numeric_id_1', 'nodeId')\n", + " .union(\n", + " twitch_df\n", + " .select('numeric_id_2')\n", + " .withColumnRenamed('numeric_id_2', 'nodeId'))\n", + " .dropDuplicates())\n", + "node_df.count()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 10, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:52:25.8382593Z", + "execution_start_time": "2025-01-11T14:52:15.2936378Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "00844db7-d433-41f6-bb3a-6d30eed6a6d0", + "queued_time": "2025-01-11T14:52:15.1638534Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 11, + "statement_ids": [ + 11 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 11, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " node_df.write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \":Node\")\n", + " .option(\"node.keys\", \"nodeId:id\")\n", + " .save()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load rels" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:55:23.0498345Z", + "execution_start_time": "2025-01-11T14:55:00.1139093Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "93db93ce-ba8c-4141-9062-ba6a9c681008", + "queued_time": "2025-01-11T14:55:00.0018896Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": 
"finished", + "statement_id": 14, + "statement_ids": [ + 14 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 14, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rel_batch_df = group_and_batch_spark_dataframe(spark_dataframe=twitch_df, source_col='numeric_id_1', target_col='numeric_id_2', num_groups=(spark_executor_count * 2) - 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T14:56:52.1245437Z", + "execution_start_time": "2025-01-11T14:56:48.2089393Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "6afd2da0-a917-4ac5-b24e-c0f609582ca7", + "queued_time": "2025-01-11T14:56:48.0701321Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 15, + "statement_ids": [ + 15 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 15, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+------------+------+-----+\n", + "|numeric_id_1|numeric_id_2| group|batch|\n", + "+------------+------------+------+-----+\n", + "| 111207| 159752|1 -- 5| 3|\n", + "| 111207| 145773|1 -- 5| 3|\n", + "| 111207| 42059|1 -- 6| 8|\n", + "| 111207| 29852|1 -- 7| 4|\n", + "| 111207| 75443|1 -- 1| 1|\n", + "| 111207| 12216|1 -- 2| 6|\n", + "| 111207| 123168|0 -- 1| 5|\n", + "| 111207| 40798|1 -- 1| 1|\n", + "| 111207| 52070|0 -- 1| 5|\n", + "| 111207| 118432|1 -- 3| 2|\n", + "| 111207| 140508|1 -- 2| 6|\n", + "| 111207| 78550|1 -- 8| 0|\n", + "| 111207| 76995|1 -- 2| 6|\n", + "| 111207| 
73856|1 -- 3| 2|\n", + "| 111207| 154308|1 -- 2| 6|\n", + "| 111207| 129693|0 -- 1| 5|\n", + "| 111207| 63938|1 -- 1| 1|\n", + "| 111207| 21381|1 -- 4| 7|\n", + "| 111207| 28488|1 -- 2| 6|\n", + "| 111207| 13564|1 -- 1| 1|\n", + "+------------+------------+------+-----+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "rel_batch_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T15:01:54.9500155Z", + "execution_start_time": "2025-01-11T14:57:24.5030447Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "ecdaef15-dc48-48f2-b23b-b53daa35827f", + "queued_time": "2025-01-11T14:57:24.3876952Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 16, + "statement_ids": [ + 16 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 16, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query = \"\"\"\n", + " MATCH(source:Node {id: event.numeric_id_1})\n", + " MATCH(target:Node {id: event.numeric_id_2})\n", + " MERGE(source)-[r:RELATES_TO]->(target)\n", + " \"\"\"\n", + "\n", + "ingest_spark_dataframe(\n", + " spark_dataframe=rel_batch_df,\n", + " save_mode= \"Overwrite\",\n", + " options={\"query\":query}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Delete rels" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } 
+ } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T15:03:46.0902216Z", + "execution_start_time": "2025-01-11T15:03:43.2960105Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "f7cb9b0e-b957-4dae-93d6-0c93f3846fe5", + "queued_time": "2025-01-11T15:03:43.1740342Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 20, + "statement_ids": [ + 20 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 20, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6797557 680\n" + ] + } + ], + "source": [ + "rel_count = twitch_df.count()\n", + "batch_count = rel_count // 10000 + 1\n", + "print(rel_count, batch_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T15:09:07.318409Z", + "execution_start_time": "2025-01-11T15:09:06.8032688Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "be93cdfa-06c6-4d6e-bc5a-0146f77d1cd1", + "queued_time": "2025-01-11T15:09:06.6770525Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 25, + "statement_ids": [ + 25 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 25, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "680\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit\n", + 
"del_df = (spark.range(batch_count)\n", + " .select(lit(1).alias(\"id\")))\n", + "print(del_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T15:16:01.4072209Z", + "execution_start_time": "2025-01-11T15:10:19.9712186Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "457a965b-b574-4751-85c2-dc2b43e2c3a2", + "queued_time": "2025-01-11T15:10:19.851916Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 27, + "statement_ids": [ + 27 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 27, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "del_query = \"MATCH ()-[r:RELATES_TO]->() WITH r LIMIT 10000 DELETE r\"\n", + "\n", + "(\n", + " del_df.coalesce(1).write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"query\", del_query)\n", + " .option(\"batch.size\", 1)\n", + " .save()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load rels serially" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": null, + "execution_start_time": "2025-01-11T15:22:59.3132605Z", + "livy_statement_state": "running", + "normalized_state": "running", + "parent_msg_id": 
"d14c3ab9-8c1b-4d65-a66e-ba784a7c620e", + "queued_time": "2025-01-11T15:22:59.1886902Z", + "session_id": "18", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "submitted", + "statement_id": 28, + "statement_ids": [ + 28 + ] + }, + "text/plain": [ + "StatementMeta(medium, 18, 28, Submitted, Running, Running)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " twitch_df.coalesce(1).write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"query\", query)\n", + " .save()\n", + ")" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "state": {}, + "version": "0.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 821c680b76204c2c7c2638e36cfcef92128aa02b Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sat, 11 Jan 2025 17:34:16 -0600 Subject: [PATCH 05/13] Correct column name. 
--- neo4j_parallel_spark_loader/monopartite/grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neo4j_parallel_spark_loader/monopartite/grouping.py b/neo4j_parallel_spark_loader/monopartite/grouping.py index fce2266..1051876 100644 --- a/neo4j_parallel_spark_loader/monopartite/grouping.py +++ b/neo4j_parallel_spark_loader/monopartite/grouping.py @@ -58,7 +58,7 @@ def create_node_groupings( ) .drop(keys_sdf.value) .join( - other=keys_sdf.withColumnRenamed("group", "source_group"), + other=keys_sdf.withColumnRenamed("group", "target_group"), on=(spark_dataframe[target_col] == keys_sdf.value), how="left" ) From e7f17d5268edf2906b7a105c811a5e233edf0506 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sat, 11 Jan 2025 17:43:57 -0600 Subject: [PATCH 06/13] Update examples --- examples/bipartite_amazon_ratings.ipynb | 887 ++++++++++++++++++ ....ipynb => monopartite_twitch_gamers.ipynb} | 33 +- ...predefined_components_reddit_threads.ipynb | 883 +++++++++++++++++ 3 files changed, 1787 insertions(+), 16 deletions(-) create mode 100644 examples/bipartite_amazon_ratings.ipynb rename examples/{twitch_gamers.ipynb => monopartite_twitch_gamers.ipynb} (96%) create mode 100644 examples/predefined_components_reddit_threads.ipynb diff --git a/examples/bipartite_amazon_ratings.ipynb b/examples/bipartite_amazon_ratings.ipynb new file mode 100644 index 0000000..746c6df --- /dev/null +++ b/examples/bipartite_amazon_ratings.ipynb @@ -0,0 +1,887 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:03:23.0244336Z", + "execution_start_time": "2025-01-11T19:03:22.8627505Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "2b085336-8c6d-4adf-97f2-4132e4403687", + "queued_time": "2025-01-11T19:03:22.102208Z", + "session_id": "19", + "session_start_time": null, + 
"spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 7, + "statement_ids": [ + 7 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 7, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.types import StructType, StructField, IntegerType, FloatType\n", + "import requests\n", + "from io import BytesIO\n", + "from zipfile import ZipFile\n", + "from neo4j_parallel_spark_loader.bipartite import group_and_batch_spark_dataframe\n", + "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create spark session" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T18:59:51.2054347Z", + "execution_start_time": "2025-01-11T18:59:51.0447282Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "46cabf82-8138-47c2-9336-203da9e26e57", + "queued_time": "2025-01-11T18:55:55.3501895Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 3, + "statement_ids": [ + 3 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 3, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "username = \"NEO4J_USER\"\n", + "password = \"NEO4J_PASSWORD\"\n", + "url = \"NEO4J_URL\"\n", + "dbname = \"NEO4J_DATABASE\"\n", + "spark_executor_count=5\n", + "\n", + "spark = (\n", + " SparkSession.builder\n", + " 
.appName(\"AmazonRatings\")\n", + " .config(\"neo4j.url\", url)\n", + " .config(\"url\", url)\n", + " .config(\"neo4j.authentication.basic.username\", username)\n", + " .config(\"neo4j.authentication.basic.password\", password)\n", + " .config(\"neo4j.database\", dbname)\n", + " .getOrCreate()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:03:31.160049Z", + "execution_start_time": "2025-01-11T19:03:25.850078Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "de533a22-47ec-4a9c-aaf8-f1c21fb818e5", + "queued_time": "2025-01-11T19:03:25.7041392Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 8, + "statement_ids": [ + 8 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 8, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+---------+------+----------+\n", + "|source_id|target_id|rating| timestamp|\n", + "+---------+---------+------+----------+\n", + "| 1| 1| 5.0|1117404000|\n", + "| 1| 2| 1.0|1105916400|\n", + "| 1| 3| 5.0|1105916400|\n", + "| 1| 4| 1.0|1105570800|\n", + "| 1| 5| 1.0|1104966000|\n", + "| 1| 6| 5.0|1103497200|\n", + "| 1| 7| 4.0|1081461600|\n", + "| 1| 8| 5.0|1074985200|\n", + "| 1| 9| 5.0|1071961200|\n", + "| 1| 10| 1.0|1071788400|\n", + "| 1| 11| 4.0|1071702000|\n", + "| 1| 12| 5.0|1070492400|\n", + "| 1| 13| 5.0|1070319600|\n", + "| 1| 14| 
5.0|1066514400|\n", + "| 1| 15| 4.0|1066341600|\n", + "| 1| 16| 5.0|1066341600|\n", + "| 1| 17| 5.0|1066168800|\n", + "| 1| 18| 2.0|1065996000|\n", + "| 1| 19| 5.0|1065909600|\n", + "| 1| 20| 5.0|1065650400|\n", + "+---------+---------+------+----------+\n", + "only showing top 20 rows\n", + "\n", + "root\n", + " |-- source_id: integer (nullable = true)\n", + " |-- target_id: integer (nullable = true)\n", + " |-- rating: float (nullable = true)\n", + " |-- timestamp: integer (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "# Define the schema\n", + "schema = StructType([\n", + " StructField(\"source_id\", IntegerType(), True),\n", + " StructField(\"target_id\", IntegerType(), True),\n", + " StructField(\"rating\", FloatType(), True),\n", + " StructField(\"timestamp\", IntegerType(), True)\n", + "])\n", + "\n", + "# Download the ZIP file\n", + "response = requests.get(\"https://nrvis.com/download/data/dynamic/rec-amazon-ratings.zip\")\n", + "zip_file = ZipFile(BytesIO(response.content))\n", + "\n", + "# Read the CSV file directly from the ZIP\n", + "with zip_file.open(\"rec-amazon-ratings.edges\") as file:\n", + " # Convert to string buffer for Spark to read\n", + " content = file.read().decode('utf-8')\n", + " \n", + " # Create RDD from content\n", + " rdd = spark.sparkContext.parallelize(content.splitlines())\n", + " \n", + " # Convert RDD to DataFrame with schema\n", + " rating_df = spark.read.csv(rdd, schema=schema, header=False)\n", + "\n", + "# Now df is your Spark DataFrame containing the data with proper column names and types\n", + "# You can verify the data\n", + "rating_df.show()\n", + "rating_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": 
"2025-01-11T19:03:43.4584278Z", + "execution_start_time": "2025-01-11T19:03:39.0646295Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "e387e6cb-4355-4128-baad-c2eccad4ca78", + "queued_time": "2025-01-11T19:03:38.9431924Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 9, + "statement_ids": [ + 9 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 9, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "5838041" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rating_df.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:13:20.0134295Z", + "execution_start_time": "2025-01-11T19:12:59.1775369Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "f6a2aa90-0280-43ba-9760-a3ebf360b3f1", + "queued_time": "2025-01-11T19:12:59.0570586Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 12, + "statement_ids": [ + 12 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 12, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " rating_df\n", + " .select(\"source_id\")\n", + " .distinct()\n", + " .write\n", + " 
.format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \":Source\")\n", + " .option(\"node.keys\", \"source_id:id\")\n", + " .option(\"schema.optimization.node.keys\", \"KEY\")\n", + " .save()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:13:32.73651Z", + "execution_start_time": "2025-01-11T19:13:20.1492446Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "b3e17212-f06f-4393-9332-2857c9b9d076", + "queued_time": "2025-01-11T19:13:00.034665Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 13, + "statement_ids": [ + 13 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 13, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " rating_df\n", + " .select(\"target_id\")\n", + " .distinct()\n", + " .write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \":Target\")\n", + " .option(\"node.keys\", \"target_id:id\")\n", + " .option(\"schema.optimization.node.keys\", \"KEY\")\n", + " .save()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load rels" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": 
"2025-01-11T19:10:35.8072375Z", + "execution_start_time": "2025-01-11T19:08:40.1552582Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "3f09d80c-27df-4086-80f0-62d78f848826", + "queued_time": "2025-01-11T19:08:40.0073078Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 10, + "statement_ids": [ + 10 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 10, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rel_batch_df = group_and_batch_spark_dataframe(spark_dataframe=rating_df, \n", + " source_col='source_id', \n", + " target_col='target_id', \n", + " num_groups=spark_executor_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:10:40.0880121Z", + "execution_start_time": "2025-01-11T19:10:36.1206943Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "c900169a-def3-41fd-99d3-2fb76a40b836", + "queued_time": "2025-01-11T19:09:19.0361583Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 11, + "statement_ids": [ + 11 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 11, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+---------+------+----------+-------+-----+\n", + "|source_id|target_id|rating| timestamp| group|batch|\n", + "+---------+---------+------+----------+-------+-----+\n", + "| 1| 7| 4.0|1081461600|3 --> 4| 
2|\n", + "| 1| 19| 5.0|1065909600|3 --> 3| 1|\n", + "| 13417| 110| 5.0|1102806000|4 --> 2| 1|\n", + "| 398740| 1497| 5.0| 999554400|4 --> 3| 2|\n", + "| 55714| 6452| 5.0|1101769200|4 --> 4| 3|\n", + "| 758089| 8282| 5.0|1024178400|3 --> 4| 2|\n", + "| 758100| 9000| 4.0|1015714800|0 --> 0| 0|\n", + "| 563976| 10287| 3.0|1098655200|4 --> 4| 3|\n", + "| 28574| 14304| 1.0|1093039200|1 --> 4| 0|\n", + "| 968213| 18887| 4.0|1056146400|2 --> 2| 4|\n", + "| 6791| 23365| 4.0|1096322400|0 --> 2| 2|\n", + "| 6791| 23365| 4.0|1096322400|0 --> 2| 2|\n", + "| 94201| 26022| 4.0|1065304800|2 --> 4| 1|\n", + "| 55711| 31486| 1.0|1082844000|2 --> 3| 0|\n", + "| 55711| 31486| 1.0|1082844000|2 --> 3| 0|\n", + "| 758084| 33837| 5.0|1015542000|1 --> 0| 1|\n", + "| 55712| 49215| 4.0|1068246000|2 --> 0| 2|\n", + "| 6791| 53906| 4.0|1096408800|0 --> 1| 1|\n", + "| 28574| 54493| 3.0|1093039200|1 --> 0| 1|\n", + "| 28574| 58905| 1.0|1093039200|1 --> 1| 2|\n", + "+---------+---------+------+----------+-------+-----+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "rel_batch_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:35:18.8392667Z", + "execution_start_time": "2025-01-11T19:30:29.4126282Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "c4010936-6bbf-453c-a2d4-ea2148e55c00", + "queued_time": "2025-01-11T19:30:29.2741482Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 19, + "statement_ids": [ + 19 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 19, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "query = \"\"\"\n", + " MATCH(source:Source {id: event.source_id})\n", + " MATCH(target:Target {id: event.target_id})\n", + " MERGE(source)-[r:RELATES_TO {timestamp:event.timestamp}]->(target)\n", + " SET r.rating = event.rating\n", + " \"\"\"\n", + "\n", + "ingest_spark_dataframe(\n", + " spark_dataframe=rel_batch_df,\n", + " save_mode= \"Overwrite\",\n", + " options={\"query\":query}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Delete rels" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:41:05.7149646Z", + "execution_start_time": "2025-01-11T19:41:01.7596122Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "c17aaaf8-6ede-4ee6-81f9-2a4f75464c33", + "queued_time": "2025-01-11T19:41:01.6354079Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 26, + "statement_ids": [ + 26 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 26, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5838041 584\n" + ] + } + ], + "source": [ + "rel_count = rating_df.count()\n", + "batch_count = rel_count // 10000 + 1\n", + "print(rel_count, batch_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + 
"application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:41:10.2940843Z", + "execution_start_time": "2025-01-11T19:41:09.7728004Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "d620521c-b189-4703-a2b6-dcffda5ce9b5", + "queued_time": "2025-01-11T19:41:09.6172495Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 27, + "statement_ids": [ + 27 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 27, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "584\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit\n", + "del_df = (spark.range(batch_count)\n", + " .select(lit(1).alias(\"id\")))\n", + "print(del_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T19:47:33.4788284Z", + "execution_start_time": "2025-01-11T19:42:38.3485714Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "eab7a384-7dfb-40aa-86f4-b422ab134efa", + "queued_time": "2025-01-11T19:42:38.2082178Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 28, + "statement_ids": [ + 28 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 28, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "del_query = \"\"\"\n", + " MATCH ()-[r:RELATES_TO]->()\n", + " WITH r LIMIT 10000\n", + " DELETE r\"\"\"\n", + "\n", + "(\n", + " 
del_df.coalesce(1).write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"query\", del_query)\n", + " .option(\"batch.size\", 1)\n", + " .save()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load rels serially" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T20:00:12.8762123Z", + "execution_start_time": "2025-01-11T19:51:49.7653219Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "bfcbca3f-7389-44c8-8b19-035d4be4dc2a", + "queued_time": "2025-01-11T19:51:49.637572Z", + "session_id": "19", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 30, + "statement_ids": [ + 30 + ] + }, + "text/plain": [ + "StatementMeta(medium, 19, 30, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " rating_df.repartition(1).write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"query\", query)\n", + " .save()\n", + ")" + ] + } + ], + "metadata": { + "description": null, + "kernelspec": { + "display_name": "python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "state": {}, + "version": "0.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/twitch_gamers.ipynb b/examples/monopartite_twitch_gamers.ipynb similarity index 96% rename from examples/twitch_gamers.ipynb rename to examples/monopartite_twitch_gamers.ipynb index 
1531eb5..6af63f9 100644 --- a/examples/twitch_gamers.ipynb +++ b/examples/monopartite_twitch_gamers.ipynb @@ -51,7 +51,8 @@ } }, "source": [ - "## Create spark session" + "## Create spark session\n", + "Update the values below for your environment" ] }, { @@ -97,12 +98,11 @@ } ], "source": [ - "username = \"NEO4J_USER_NAME\"\n", + "username = \"NEO4J_USER\"\n", "password = \"NEO4J_PASSWORD\"\n", "url = \"NEO4J_URL\"\n", - "dbname = \"NEO4J_DB\"\n", - "spark_executor_count = 5\n", - "\n", + "dbname = \"NEO4J_DATABASE\"\n", + "spark_executor_count=5\n", "\n", "spark = (\n", " SparkSession.builder\n", @@ -411,6 +411,7 @@ " .mode(\"Overwrite\")\n", " .option(\"labels\", \":Node\")\n", " .option(\"node.keys\", \"nodeId:id\")\n", + " .option(\"schema.optimization.node.keys\", \"KEY\")\n", " .save()\n", ")" ] @@ -471,7 +472,10 @@ } ], "source": [ - "rel_batch_df = group_and_batch_spark_dataframe(spark_dataframe=twitch_df, source_col='numeric_id_1', target_col='numeric_id_2', num_groups=(spark_executor_count * 2) - 1)" + "rel_batch_df = group_and_batch_spark_dataframe(spark_dataframe=twitch_df, \n", + " source_col='numeric_id_1', \n", + " target_col='numeric_id_2', \n", + " num_groups=(2 * spark_executor_count)-1)" ] }, { @@ -818,24 +822,24 @@ { "data": { "application/vnd.livy.statement-meta+json": { - "execution_finish_time": null, + "execution_finish_time": "2025-01-11T15:32:11.4027857Z", "execution_start_time": "2025-01-11T15:22:59.3132605Z", - "livy_statement_state": "running", - "normalized_state": "running", + "livy_statement_state": "available", + "normalized_state": "finished", "parent_msg_id": "d14c3ab9-8c1b-4d65-a66e-ba784a7c620e", "queued_time": "2025-01-11T15:22:59.1886902Z", "session_id": "18", "session_start_time": null, "spark_jobs": null, "spark_pool": "medium", - "state": "submitted", + "state": "finished", "statement_id": 28, "statement_ids": [ 28 ] }, "text/plain": [ - "StatementMeta(medium, 18, 28, Submitted, Running, Running)" + "StatementMeta(medium, 
18, 28, Finished, Available, Finished)" ] }, "metadata": {}, @@ -854,12 +858,9 @@ } ], "metadata": { - "kernel_info": { - "name": "synapse_pyspark" - }, + "description": null, "kernelspec": { - "display_name": "Synapse PySpark", - "language": "Python", + "display_name": "python", "name": "synapse_pyspark" }, "language_info": { diff --git a/examples/predefined_components_reddit_threads.ipynb b/examples/predefined_components_reddit_threads.ipynb new file mode 100644 index 0000000..a5b0d63 --- /dev/null +++ b/examples/predefined_components_reddit_threads.ipynb @@ -0,0 +1,883 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T22:00:42.4235235Z", + "execution_start_time": "2025-01-11T22:00:42.2764302Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "04a8f60b-046e-4d2e-956f-a81a40a47cf5", + "queued_time": "2025-01-11T22:00:42.1460445Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 9, + "statement_ids": [ + 9 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 9, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.types import StructType, StructField, IntegerType\n", + "import requests\n", + "from io import BytesIO\n", + "from zipfile import ZipFile\n", + "from neo4j_parallel_spark_loader.predefined_components import group_and_batch_spark_dataframe\n", + "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create spark session\n", + "Update the values below for your environment" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T21:44:18.5177192Z", + "execution_start_time": "2025-01-11T21:44:18.3725338Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "da70e421-13d6-4887-bbb3-d39f40e87b24", + "queued_time": "2025-01-11T21:44:18.2491489Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 3, + "statement_ids": [ + 3 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 3, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "username = \"NEO4J_USER\"\n", + "password = \"NEO4J_PASSWORD\"\n", + "url = \"NEO4J_URL\"\n", + "dbname = \"NEO4J_DATABASE\"\n", + "spark_executor_count=5\n", + "\n", + "spark = (\n", + " SparkSession.builder\n", + " .appName(\"RedditThreads\")\n", + " .config(\"neo4j.url\", url)\n", + " .config(\"url\", url)\n", + " .config(\"neo4j.authentication.basic.username\", username)\n", + " .config(\"neo4j.authentication.basic.password\", password)\n", + " .config(\"neo4j.database\", dbname)\n", + " .getOrCreate()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:01:14.2805724Z", 
+ "execution_start_time": "2025-01-11T23:00:36.4081869Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "35d72265-fbd4-4fa3-91c6-238709bf8ad0", + "queued_time": "2025-01-11T23:00:35.3496326Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 45, + "statement_ids": [ + 45 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 45, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+---------+---------+\n", + "|graph_id|source_id|target_id|\n", + "+--------+---------+---------+\n", + "| 0| 0| 2|\n", + "| 0| 1| 5|\n", + "| 0| 2| 4|\n", + "| 0| 2| 5|\n", + "| 0| 2| 6|\n", + "| 0| 2| 7|\n", + "| 0| 2| 8|\n", + "| 0| 2| 9|\n", + "| 0| 2| 10|\n", + "| 0| 3| 8|\n", + "| 1| 0| 3|\n", + "| 1| 0| 6|\n", + "| 1| 1| 8|\n", + "| 1| 2| 8|\n", + "| 1| 4| 8|\n", + "| 1| 5| 8|\n", + "| 1| 6| 8|\n", + "| 1| 7| 8|\n", + "| 1| 8| 9|\n", + "| 1| 8| 10|\n", + "+--------+---------+---------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "schema = StructType([\n", + " StructField(\"graph_id\", IntegerType(), True),\n", + " StructField(\"source_id\", IntegerType(), True),\n", + " StructField(\"target_id\", IntegerType(), True)\n", + "])\n", + "\n", + "# Download and read the zip file content\n", + "response = requests.get(\"https://snap.stanford.edu/data/reddit_threads.zip\")\n", + "zip_file = ZipFile(BytesIO(response.content))\n", + "\n", + "# Read the JSON file content from the zip\n", + "with zip_file.open(\"reddit_threads/reddit_edges.json\") as file:\n", + " # Parse JSON content\n", + " data = json.loads(file.read().decode('utf-8'))\n", + " flattened = [[int(t[0]), int(sublist[0]), int(sublist[1])] for t in data.items() for sublist in t[1]]\n", + " \n", + " # Create DataFrame from parsed JSON\n", + " 
reddit_df = spark.createDataFrame(flattened, schema=schema)\n", + "\n", + "# Show the result\n", + "reddit_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:01:16.2592288Z", + "execution_start_time": "2025-01-11T23:01:14.4142013Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "0eb527b1-e56e-4a60-8fe6-9e4419af89bd", + "queued_time": "2025-01-11T23:00:43.7342871Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 46, + "statement_ids": [ + 46 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 46, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "5074915" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reddit_df.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:23:27.639813Z", + "execution_start_time": "2025-01-11T23:23:23.682035Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "7b8ad8aa-c3b2-4ccf-b0f3-e63f1149bd63", + "queued_time": "2025-01-11T23:23:23.5534729Z", + "session_id": "26", + 
"session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 64, + "statement_ids": [ + 64 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 64, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "4859280" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "node_df = (reddit_df\n", + " .select('graph_id', 'source_id')\n", + " .withColumnRenamed('source_id', 'nodeId')\n", + " .union(\n", + " reddit_df\n", + " .select('graph_id', 'target_id')\n", + " .withColumnRenamed('target_id', 'nodeId'))\n", + " .dropDuplicates())\n", + "node_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:26:11.8251813Z", + "execution_start_time": "2025-01-11T23:25:10.4645703Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "8ebd5d91-2211-40f6-955b-c892e42b18ed", + "queued_time": "2025-01-11T23:25:10.3417526Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 65, + "statement_ids": [ + 65 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 65, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " node_df.write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \":Node\")\n", + " .option(\"node.keys\", \"graph_id:graphId,nodeId:nodeId\")\n", + " .option(\"schema.optimization.node.keys\", \"KEY\")\n", + " .save()\n", + ")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load rels" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:26:40.0326641Z", + "execution_start_time": "2025-01-11T23:26:29.4493665Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "e541da32-90b4-479c-a59b-4f959cca62bb", + "queued_time": "2025-01-11T23:25:49.4755611Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 68, + "statement_ids": [ + 68 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 68, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rel_batch_df = group_and_batch_spark_dataframe(spark_dataframe=reddit_df, \n", + " partition_col='graph_id', \n", + " num_groups=spark_executor_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:26:41.9804998Z", + "execution_start_time": "2025-01-11T23:26:40.1656981Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "969c7e39-5cf1-4c68-b3f8-733a3da7f95c", + "queued_time": "2025-01-11T23:25:53.0269186Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 69, + "statement_ids": [ + 69 + ] + }, 
+ "text/plain": [ + "StatementMeta(medium, 26, 69, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+---------+---------+-----+-----+\n", + "|graph_id|source_id|target_id|group|batch|\n", + "+--------+---------+---------+-----+-----+\n", + "| 10223| 0| 2| 3| 0|\n", + "| 10223| 1| 2| 3| 0|\n", + "| 10223| 2| 3| 3| 0|\n", + "| 10222| 7| 23| 2| 0|\n", + "| 10222| 8| 16| 2| 0|\n", + "| 10222| 9| 16| 2| 0|\n", + "| 10222| 10| 16| 2| 0|\n", + "| 10222| 11| 16| 2| 0|\n", + "| 10222| 13| 16| 2| 0|\n", + "| 10222| 14| 16| 2| 0|\n", + "| 10222| 15| 16| 2| 0|\n", + "| 10222| 16| 17| 2| 0|\n", + "| 10222| 16| 18| 2| 0|\n", + "| 10222| 16| 19| 2| 0|\n", + "| 10222| 16| 20| 2| 0|\n", + "| 10222| 16| 21| 2| 0|\n", + "| 10222| 16| 22| 2| 0|\n", + "| 10222| 16| 23| 2| 0|\n", + "| 10222| 16| 24| 2| 0|\n", + "| 10222| 16| 25| 2| 0|\n", + "+--------+---------+---------+-----+-----+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "rel_batch_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:29:15.6482319Z", + "execution_start_time": "2025-01-11T23:27:04.6742861Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "26133e00-34f6-4c97-997c-69b0ba1a7d18", + "queued_time": "2025-01-11T23:27:04.555026Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 70, + "statement_ids": [ + 70 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 70, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "query = \"\"\"\n", + " MATCH(source:Node {graphId: event.graph_id, nodeId: event.source_id})\n", + " MATCH(target:Node {graphId: event.graph_id, nodeId: event.target_id})\n", + " MERGE(source)-[r:RELATES_TO]->(target)\n", + " \"\"\"\n", + "\n", + "ingest_spark_dataframe(\n", + " spark_dataframe=rel_batch_df,\n", + " save_mode= \"Overwrite\",\n", + " options={\"query\":query}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Delete rels" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:29:28.5515873Z", + "execution_start_time": "2025-01-11T23:29:26.7485404Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "3a05e4c2-9b0f-4c01-a7d5-61b002e483bf", + "queued_time": "2025-01-11T23:29:26.6421279Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 71, + "statement_ids": [ + 71 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 71, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5074915 508\n" + ] + } + ], + "source": [ + "rel_count = reddit_df.count()\n", + "batch_count = rel_count // 10000 + 1\n", + "print(rel_count, batch_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + 
"application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:29:33.0382324Z", + "execution_start_time": "2025-01-11T23:29:32.5200288Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "db60fdcf-5a4f-443f-b6fc-03b8e65e5613", + "queued_time": "2025-01-11T23:29:32.4091933Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 72, + "statement_ids": [ + 72 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 72, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "508\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit\n", + "del_df = (spark.range(batch_count)\n", + " .select(lit(1).alias(\"id\")))\n", + "print(del_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:32:00.5912265Z", + "execution_start_time": "2025-01-11T23:29:36.7229458Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "048159bf-fd15-4150-bfc2-843fc27d3241", + "queued_time": "2025-01-11T23:29:36.5941022Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 73, + "statement_ids": [ + 73 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 73, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "del_query = \"\"\"\n", + " MATCH ()-[r:RELATES_TO]->()\n", + " WITH r LIMIT 10000\n", + " DELETE r\"\"\"\n", + "\n", + "(\n", + " 
del_df.coalesce(1).write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"query\", del_query)\n", + " .option(\"batch.size\", 1)\n", + " .save()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load rels serially" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2025-01-11T23:36:18.2333344Z", + "execution_start_time": "2025-01-11T23:32:00.7201964Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "34fccf78-1a16-4138-af26-e57972d5b79a", + "queued_time": "2025-01-11T23:29:55.6343855Z", + "session_id": "26", + "session_start_time": null, + "spark_jobs": null, + "spark_pool": "medium", + "state": "finished", + "statement_id": 74, + "statement_ids": [ + 74 + ] + }, + "text/plain": [ + "StatementMeta(medium, 26, 74, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " reddit_df.coalesce(1).write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"query\", query)\n", + " .save()\n", + ")" + ] + } + ], + "metadata": { + "description": null, + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "save_output": true, + "synapse_widget": { + "state": {}, + "version": "0.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3a65664fdbe6f7096b340a9c97b039509f466521 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sun, 12 Jan 2025 09:27:41 -0600 
Subject: [PATCH 07/13] Correct spacing bug --- .../monopartite/batching.py | 2 +- tests/unit/monopartite/conftest.py | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/neo4j_parallel_spark_loader/monopartite/batching.py b/neo4j_parallel_spark_loader/monopartite/batching.py index 955ac8e..ae2c109 100644 --- a/neo4j_parallel_spark_loader/monopartite/batching.py +++ b/neo4j_parallel_spark_loader/monopartite/batching.py @@ -32,7 +32,7 @@ def create_ingest_batches_from_groups(spark_dataframe: DataFrame) -> DataFrame: coloring = color_complete_graph_with_self_loops(group_count) - coloring_data = [(f"{k[0]}--{k[1]}", v) for k, v in coloring.items()] + coloring_data = [(f"{k[0]} -- {k[1]}", v) for k, v in coloring.items()] # Create a DataFrame from the coloring dictionary coloring_df = spark.createDataFrame(coloring_data, ["group", "batch"]) diff --git a/tests/unit/monopartite/conftest.py b/tests/unit/monopartite/conftest.py index e1e7e93..58bbb07 100644 --- a/tests/unit/monopartite/conftest.py +++ b/tests/unit/monopartite/conftest.py @@ -6,11 +6,11 @@ @pytest.fixture(scope="module") def monopartite_batching_data() -> List[Dict[str, int]]: return [ - {"group": "1--3", "source_group": 1, "target_group": 3}, - {"group": "2--4", "source_group": 2, "target_group": 4}, - {"group": "3--5", "source_group": 5, "target_group": 3}, - {"group": "4--6", "source_group": 6, "target_group": 4}, - {"group": "0--0", "source_group": 0, "target_group": 0}, + {"group": "1 -- 3", "source_group": 1, "target_group": 3}, + {"group": "2 -- 4", "source_group": 2, "target_group": 4}, + {"group": "3 -- 5", "source_group": 5, "target_group": 3}, + {"group": "4 -- 6", "source_group": 6, "target_group": 4}, + {"group": "0 -- 0", "source_group": 0, "target_group": 0}, ] @@ -20,56 +20,56 @@ def monopartite_dupe_batching_data() -> List[Dict[str, int]]: { "source_group": 1, "target_group": 3, - "group": "1--3", + "group": "1 -- 3", "source_node": 1, "target_node": 3, }, 
{ "source_group": 2, "target_group": 4, - "group": "2--4", + "group": "2 -- 4", "source_node": 4, "target_node": 2, }, { "source_group": 3, "target_group": 5, - "group": "3--5", + "group": "3 -- 5", "source_node": 3, "target_node": 5, }, { "source_group": 4, "target_group": 6, - "group": "4--6", + "group": "4 -- 6", "source_node": 4, "target_node": 6, }, { "source_group": 0, "target_group": 0, - "group": "0--0", + "group": "0 -- 0", "source_node": 0, "target_node": 0, }, { "source_group": 3, "target_group": 1, - "group": "1--3", + "group": "1 -- 3", "source_node": 3, "target_node": 1, }, { "source_group": 0, "target_group": 0, - "group": "0--0", + "group": "0 -- 0", "source_node": 0, "target_node": 0, }, { "source_group": 6, "target_group": 4, - "group": "4--6", + "group": "4 -- 6", "source_node": 6, "target_node": 4, }, From 72aafb9cfdef059a47e53121ed21aaad584fe1b6 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sun, 12 Jan 2025 11:15:35 -0600 Subject: [PATCH 08/13] Don't show empty heatmap cells --- examples/bipartite_amazon_ratings.ipynb | 580 ++++++------------ .../visualize/heatmap.py | 79 ++- 2 files changed, 234 insertions(+), 425 deletions(-) diff --git a/examples/bipartite_amazon_ratings.ipynb b/examples/bipartite_amazon_ratings.ipynb index 746c6df..87511c1 100644 --- a/examples/bipartite_amazon_ratings.ipynb +++ b/examples/bipartite_amazon_ratings.ipynb @@ -2,36 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:03:23.0244336Z", - "execution_start_time": "2025-01-11T19:03:22.8627505Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "2b085336-8c6d-4adf-97f2-4132e4403687", - "queued_time": "2025-01-11T19:03:22.102208Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": 
"finished", - "statement_id": 7, - "statement_ids": [ - 7 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 7, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import StructType, StructField, IntegerType, FloatType\n", @@ -39,7 +12,8 @@ "from io import BytesIO\n", "from zipfile import ZipFile\n", "from neo4j_parallel_spark_loader.bipartite import group_and_batch_spark_dataframe\n", - "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n" + "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", + "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap\n" ] }, { @@ -52,7 +26,8 @@ } }, "source": [ - "## Create spark session" + "## Create spark session\n", + "Update the values below for your environment" ] }, { @@ -71,30 +46,20 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T18:59:51.2054347Z", - "execution_start_time": "2025-01-11T18:59:51.0447282Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "46cabf82-8138-47c2-9336-203da9e26e57", - "queued_time": "2025-01-11T18:55:55.3501895Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 3, - "statement_ids": [ - 3 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 3, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Ignoring non-Spark config property: neo4j.url\n", + "Warning: Ignoring non-Spark config property: url\n", + "Warning: Ignoring non-Spark config property: neo4j.database\n", + "Warning: Ignoring non-Spark config property: neo4j.authentication.basic.password\n", + "Warning: Ignoring non-Spark config 
property: neo4j.authentication.basic.username\n", + "25/01/12 10:50:29 WARN Utils: Your hostname, Nathans-MacBook-Pro-3.local resolves to a loopback address: 127.0.0.1; using 192.168.86.181 instead (on interface en0)\n", + "25/01/12 10:50:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "25/01/12 10:50:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] } ], "source": [ @@ -131,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": { "jupyter": { "outputs_hidden": false, @@ -145,30 +110,12 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:03:31.160049Z", - "execution_start_time": "2025-01-11T19:03:25.850078Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "de533a22-47ec-4a9c-aaf8-f1c21fb818e5", - "queued_time": "2025-01-11T19:03:25.7041392Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 8, - "statement_ids": [ - 8 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 8, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 10:50:44 WARN TaskSetManager: Stage 0 contains a task of very large size (20312 KiB). 
The maximum recommended task size is 1000 KiB.\n", + "[Stage 0:> (0 + 1) / 1]\r" + ] }, { "name": "stdout", @@ -207,6 +154,14 @@ " |-- timestamp: integer (nullable = true)\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 10:50:49 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker\n", + " \r" + ] } ], "source": [ @@ -241,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": { "jupyter": { "outputs_hidden": false, @@ -255,30 +210,12 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:03:43.4584278Z", - "execution_start_time": "2025-01-11T19:03:39.0646295Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "e387e6cb-4355-4128-baad-c2eccad4ca78", - "queued_time": "2025-01-11T19:03:38.9431924Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 9, - "statement_ids": [ - 9 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 9, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 10:50:49 WARN TaskSetManager: Stage 1 contains a task of very large size (20312 KiB). 
The maximum recommended task size is 1000 KiB.\n", + " \r" + ] }, { "data": { @@ -286,7 +223,7 @@ "5838041" ] }, - "execution_count": 19, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -310,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -322,34 +259,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:13:20.0134295Z", - "execution_start_time": "2025-01-11T19:12:59.1775369Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "f6a2aa90-0280-43ba-9760-a3ebf360b3f1", - "queued_time": "2025-01-11T19:12:59.0570586Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 12, - "statement_ids": [ - 12 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 12, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " rating_df\n", @@ -367,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -379,34 +289,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:13:32.73651Z", - "execution_start_time": "2025-01-11T19:13:20.1492446Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "b3e17212-f06f-4393-9332-2857c9b9d076", - "queued_time": "2025-01-11T19:13:00.034665Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 13, - "statement_ids": [ - 13 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 13, Finished, Available, Finished)" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " rating_df\n", @@ -437,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": { "jupyter": { "outputs_hidden": false, @@ -451,30 +334,34 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:10:35.8072375Z", - "execution_start_time": "2025-01-11T19:08:40.1552582Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "3f09d80c-27df-4086-80f0-62d78f848826", - "queued_time": "2025-01-11T19:08:40.0073078Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 10, - "statement_ids": [ - 10 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 10, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 10:53:29 WARN TaskSetManager: Stage 4 contains a task of very large size (20312 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:54:20 WARN TaskSetManager: Stage 12 contains a task of very large size (20312 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:54:50 WARN TaskSetManager: Stage 20 contains a task of very large size (20312 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:54:55 WARN TaskSetManager: Stage 21 contains a task of very large size (2348 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. 
Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:54:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:01 WARN TaskSetManager: Stage 34 contains a task of very large size (20312 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:07 WARN TaskSetManager: Stage 35 contains a task of very large size (2348 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:08 WARN TaskSetManager: Stage 36 contains a task of very large size (1339 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. 
Will not spill but return 0.\n", + "25/01/12 10:55:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + " \r" + ] } ], "source": [ @@ -486,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": { "jupyter": { "outputs_hidden": false, @@ -500,30 +387,22 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:10:40.0880121Z", - "execution_start_time": "2025-01-11T19:10:36.1206943Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "c900169a-def3-41fd-99d3-2fb76a40b836", - "queued_time": "2025-01-11T19:09:19.0361583Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 11, - "statement_ids": [ - 11 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 11, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 10:55:16 WARN TaskSetManager: Stage 58 contains a task of very large size (20312 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 58 (TID 134): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN TaskSetManager: Stage 59 contains a task of very large size (2348 KiB). 
The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 5.0 in stage 58 (TID 139): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 7.0 in stage 58 (TID 141): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 2.0 in stage 58 (TID 136): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 1.0 in stage 58 (TID 135): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 6.0 in stage 58 (TID 140): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 4.0 in stage 58 (TID 138): Attempting to kill Python Worker\n", + "25/01/12 10:55:20 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 58 (TID 137): Attempting to kill Python Worker\n", + "25/01/12 10:55:21 WARN TaskSetManager: Stage 60 contains a task of very large size (1339 KiB). 
The maximum recommended task size is 1000 KiB.\n", + " \r" + ] }, { "name": "stdout", @@ -532,26 +411,26 @@ "+---------+---------+------+----------+-------+-----+\n", "|source_id|target_id|rating| timestamp| group|batch|\n", "+---------+---------+------+----------+-------+-----+\n", - "| 1| 7| 4.0|1081461600|3 --> 4| 2|\n", - "| 1| 19| 5.0|1065909600|3 --> 3| 1|\n", - "| 13417| 110| 5.0|1102806000|4 --> 2| 1|\n", - "| 398740| 1497| 5.0| 999554400|4 --> 3| 2|\n", - "| 55714| 6452| 5.0|1101769200|4 --> 4| 3|\n", - "| 758089| 8282| 5.0|1024178400|3 --> 4| 2|\n", - "| 758100| 9000| 4.0|1015714800|0 --> 0| 0|\n", - "| 563976| 10287| 3.0|1098655200|4 --> 4| 3|\n", - "| 28574| 14304| 1.0|1093039200|1 --> 4| 0|\n", - "| 968213| 18887| 4.0|1056146400|2 --> 2| 4|\n", - "| 6791| 23365| 4.0|1096322400|0 --> 2| 2|\n", - "| 6791| 23365| 4.0|1096322400|0 --> 2| 2|\n", - "| 94201| 26022| 4.0|1065304800|2 --> 4| 1|\n", - "| 55711| 31486| 1.0|1082844000|2 --> 3| 0|\n", - "| 55711| 31486| 1.0|1082844000|2 --> 3| 0|\n", - "| 758084| 33837| 5.0|1015542000|1 --> 0| 1|\n", - "| 55712| 49215| 4.0|1068246000|2 --> 0| 2|\n", - "| 6791| 53906| 4.0|1096408800|0 --> 1| 1|\n", - "| 28574| 54493| 3.0|1093039200|1 --> 0| 1|\n", - "| 28574| 58905| 1.0|1093039200|1 --> 1| 2|\n", + "| 1| 7| 4.0|1081461600|4 --> 4| 3|\n", + "| 1| 19| 5.0|1065909600|4 --> 2| 1|\n", + "| 41421| 3243| 1.0|1113256800|2 --> 1| 3|\n", + "| 16832| 5177| 1.0|1097532000|1 --> 3| 4|\n", + "| 636810| 8321| 2.0| 955058400|4 --> 1| 0|\n", + "| 16831| 18546| 5.0| 956008800|3 --> 4| 2|\n", + "| 1673392| 36750| 1.0|1139094000|0 --> 3| 3|\n", + "| 16831| 42744| 4.0|1008284400|3 --> 3| 1|\n", + "| 16831| 44234| 5.0| 984956400|3 --> 0| 3|\n", + "| 1197279| 79793| 5.0|1102114800|2 --> 2| 4|\n", + "| 41420| 85397| 5.0|1043708400|0 --> 4| 4|\n", + "| 636807| 131380| 5.0| 943225200|4 --> 1| 0|\n", + "| 236796| 220160| 5.0|1126303200|3 --> 2| 0|\n", + "| 236796| 237793| 5.0|1144188000|3 --> 1| 4|\n", + "| 636818| 420727| 1.0|1138057200|0 
--> 1| 1|\n", + "| 236796| 497225| 5.0|1122415200|3 --> 1| 4|\n", + "| 41421| 555044| 5.0|1112911200|2 --> 4| 1|\n", + "| 236792| 838278| 3.0|1074726000|0 --> 3| 3|\n", + "| 236798| 838281| 4.0|1000850400|0 --> 2| 2|\n", + "| 636809| 868684| 2.0| 951260400|0 --> 1| 1|\n", "+---------+---------+------+----------+-------+-----+\n", "only showing top 20 rows\n", "\n" @@ -564,7 +443,55 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 10:55:23 WARN TaskSetManager: Stage 69 contains a task of very large size (20312 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:28 WARN TaskSetManager: Stage 70 contains a task of very large size (2348 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:29 WARN TaskSetManager: Stage 71 contains a task of very large size (1339 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", + "25/01/12 10:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. 
Will not spill but return 0.\n", + " \r" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "create_ingest_heatmap(rel_batch_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -576,34 +503,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:35:18.8392667Z", - "execution_start_time": "2025-01-11T19:30:29.4126282Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "c4010936-6bbf-453c-a2d4-ea2148e55c00", - "queued_time": "2025-01-11T19:30:29.2741482Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 19, - "statement_ids": [ - 19 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 19, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "query = \"\"\"\n", " MATCH(source:Source {id: event.source_id})\n", @@ -634,7 +534,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -646,41 +546,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:41:05.7149646Z", - "execution_start_time": "2025-01-11T19:41:01.7596122Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "c17aaaf8-6ede-4ee6-81f9-2a4f75464c33", - "queued_time": "2025-01-11T19:41:01.6354079Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 26, - "statement_ids": [ - 26 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 26, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "5838041 584\n" - ] - } - ], + "outputs": [], "source": [ "rel_count = rating_df.count()\n", "batch_count = rel_count // 10000 + 1\n", @@ -689,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -701,41 +567,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:41:10.2940843Z", - "execution_start_time": "2025-01-11T19:41:09.7728004Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "d620521c-b189-4703-a2b6-dcffda5ce9b5", - "queued_time": "2025-01-11T19:41:09.6172495Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 27, - "statement_ids": [ - 27 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 27, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "584\n" - ] - } - ], + "outputs": [], "source": [ "from pyspark.sql.functions import lit\n", "del_df = (spark.range(batch_count)\n", @@ -745,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -757,34 +589,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T19:47:33.4788284Z", - "execution_start_time": "2025-01-11T19:42:38.3485714Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "eab7a384-7dfb-40aa-86f4-b422ab134efa", - "queued_time": "2025-01-11T19:42:38.2082178Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 28, - "statement_ids": [ - 28 - ] - }, - "text/plain": [ - 
"StatementMeta(medium, 19, 28, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "del_query = \"\"\"\n", " MATCH ()-[r:RELATES_TO]->()\n", @@ -816,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -828,34 +633,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T20:00:12.8762123Z", - "execution_start_time": "2025-01-11T19:51:49.7653219Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "bfcbca3f-7389-44c8-8b19-035d4be4dc2a", - "queued_time": "2025-01-11T19:51:49.637572Z", - "session_id": "19", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 30, - "statement_ids": [ - 30 - ] - }, - "text/plain": [ - "StatementMeta(medium, 19, 30, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " rating_df.repartition(1).write\n", @@ -870,11 +648,21 @@ "metadata": { "description": null, "kernelspec": { - "display_name": "python", - "name": "synapse_pyspark" + "display_name": ".venv", + "language": "python", + "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" }, "save_output": true, "synapse_widget": { diff --git a/neo4j_parallel_spark_loader/visualize/heatmap.py b/neo4j_parallel_spark_loader/visualize/heatmap.py index b8b02e3..321b895 100644 --- a/neo4j_parallel_spark_loader/visualize/heatmap.py +++ b/neo4j_parallel_spark_loader/visualize/heatmap.py @@ -1,14 +1,13 @@ from typing import Any, Dict, List - 
import numpy as np import seaborn as sns from matplotlib.axes import Axes from pyspark.sql import DataFrame - +import matplotlib.pyplot as plt def _format_spark_dataframe_for_visualization( spark_dataframe: DataFrame, -) -> List[Dict[str, int]]: +) -> List[Dict[str, Any]]: """ Prepare a Pandas DataFrame to be displayed as a heatmap visualizing the group and batch counts. @@ -19,20 +18,18 @@ def _format_spark_dataframe_for_visualization( Returns ------- - List[Dict[str, int]] + List[Dict[str, Any]] The Spark DataFrame contents processed and formatted as a list of dictionaries. """ - counts_sdf = spark_dataframe.groupBy("group", "batch").count() return [row.asDict() for row in counts_sdf.collect()] - def create_ingest_heatmap( - spark_dataframe: DataFrame, title: str = "Parallel Ingest Heat Map" + spark_dataframe: DataFrame, title: str = "Parallel Ingest Heat Map", figsize=(8, 12) ) -> Axes: """ Create the ingest heatmap from a list of dictionaries. - This heatmap will display the groups on the y-axis and batches on the x-axis in sequential order. + This heatmap will display batches on the y-axis and group numbers on the x-axis. Parameters ---------- @@ -40,45 +37,69 @@ def create_ingest_heatmap( A Spark DataFrame with columns including 'group', 'batch' and 'count' title : str, optional A title for the visualization, by default "Parallel Ingest Heat Map" + figsize : tuple, optional + Figure size (width, height) in inches, by default (8, 12) Returns ------- Axes A Matplotlib Axes object for visualization. """ - data = _format_spark_dataframe_for_visualization(spark_dataframe=spark_dataframe) assert ( set(data[0].keys()) == {"group", "batch", "count"} - ), "Invalid keys detected in data. Dictionary keys must contain only 'group', 'batch' and 'count'." - - X_KEY = "batch" - Y_KEY = "group" - VALUE_KEY = "count" + ), "Invalid keys detected in data. 
Dictionary keys must contain only 'group', 'batch' and 'count'" + + # Create a dictionary to store group-to-number mapping for each batch + batch_group_mappings = {} + for d in data: + batch = d["batch"] + if batch not in batch_group_mappings: + batch_group_mappings[batch] = {} + batch_group_mappings[batch][d["group"]] = len(batch_group_mappings[batch]) + 1 + + # Transform data with group numbers + transformed_data = [] + for d in data: + transformed_data.append({ + "batch": d["batch"], + "group_num": batch_group_mappings[d["batch"]][d["group"]], + "count": d["count"], + "original_group": d["group"] + }) # Extract unique x and y values - x_values = sorted(set(d[X_KEY] for d in data)) - y_values = sorted(set(d[Y_KEY] for d in data)) + y_values = sorted(set(d["batch"] for d in transformed_data), reverse=True) + x_values = sorted(set(d["group_num"] for d in transformed_data)) - # Create a 2D numpy array for the heatmap + # Create 2D numpy arrays for the heatmap heatmap_data = np.zeros((len(y_values), len(x_values))) - - # Fill the array with values - for item in data: - x_idx = x_values.index(item[X_KEY]) - y_idx = y_values.index(item[Y_KEY]) - heatmap_data[y_idx, x_idx] = item[VALUE_KEY] - + annotation_labels = np.empty((len(y_values), len(x_values)), dtype=object) + + # Fill the arrays with values + for item in transformed_data: + y_idx = y_values.index(item["batch"]) + x_idx = x_values.index(item["group_num"]) + heatmap_data[y_idx, x_idx] = item["count"] + # Create annotation with count and original group name + annotation_labels[y_idx, x_idx] = f"{item['count']:,.0f}\n({item['original_group']})" + + # Create figure with specified size + plt.figure(figsize=figsize) + + # Create heatmap ax = sns.heatmap( data=heatmap_data, - annot=True, + annot=annotation_labels, + fmt="", xticklabels=x_values, yticklabels=y_values, linewidths=0.5, ) - ax.set_xlabel("Batch") - ax.set_ylabel("Group") + + ax.set_xlabel("Group Number") + ax.set_ylabel("Batch") ax.set_title(title) - 
ax.invert_yaxis() - return ax + + return ax \ No newline at end of file From 74f640b12b8694625c61eb36dd33257945bab479 Mon Sep 17 00:00:00 2001 From: Nathan Smith Date: Sun, 12 Jan 2025 16:08:31 -0600 Subject: [PATCH 09/13] update visualization --- examples/monopartite_twitch_gamers.ipynb | 550 +++++------------- ...predefined_components_reddit_threads.ipynb | 313 +++++----- 2 files changed, 281 insertions(+), 582 deletions(-) diff --git a/examples/monopartite_twitch_gamers.ipynb b/examples/monopartite_twitch_gamers.ipynb index 6af63f9..0bf13af 100644 --- a/examples/monopartite_twitch_gamers.ipynb +++ b/examples/monopartite_twitch_gamers.ipynb @@ -4,41 +4,15 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:49:41.2174665Z", - "execution_start_time": "2025-01-11T14:49:40.6965287Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "20a7c7a6-49c7-4b5a-9f32-5b15f0e51019", - "queued_time": "2025-01-11T14:48:11.9992599Z", - "session_id": "18", - "session_start_time": "2025-01-11T14:48:12.0369851Z", - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 2, - "statement_ids": [ - 2 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 2, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "import requests\n", "from io import BytesIO\n", "from zipfile import ZipFile\n", "from neo4j_parallel_spark_loader.monopartite import group_and_batch_spark_dataframe\n", - "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n" + "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", + "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap" ] }, { @@ -71,30 +45,18 @@ }, "outputs": [ { - "data": { - 
"application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:49:41.5117197Z", - "execution_start_time": "2025-01-11T14:49:41.3585458Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "769d32ac-fec7-4674-b91c-561e4424b35a", - "queued_time": "2025-01-11T14:48:28.1869292Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 3, - "statement_ids": [ - 3 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 3, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Ignoring non-Spark config property: neo4j.url\n", + "Warning: Ignoring non-Spark config property: url\n", + "Warning: Ignoring non-Spark config property: neo4j.database\n", + "Warning: Ignoring non-Spark config property: neo4j.authentication.basic.password\n", + "Warning: Ignoring non-Spark config property: neo4j.authentication.basic.username\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "25/01/12 09:48:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" + ] } ], "source": [ @@ -145,30 +107,14 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:50:23.0615084Z", - "execution_start_time": "2025-01-11T14:49:55.5797104Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "2d565aae-42f2-4029-a551-702623320475", - "queued_time": "2025-01-11T14:49:55.4616629Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 4, - "statement_ids": [ - 4 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 4, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 09:48:34 WARN TaskSetManager: Stage 0 contains a task of very large size (12201 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 09:48:39 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker\n", + "25/01/12 09:48:40 WARN TaskSetManager: Stage 1 contains a task of very large size (12201 KiB). 
The maximum recommended task size is 1000 KiB.\n", + "[Stage 1:> (0 + 1) / 1]\r" + ] }, { "name": "stdout", @@ -205,6 +151,14 @@ " |-- numeric_id_2: string (nullable = true)\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 09:48:44 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 1 (TID 1): Attempting to kill Python Worker\n", + " \r" + ] } ], "source": [ @@ -231,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "jupyter": { "outputs_hidden": false, @@ -245,30 +199,12 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:50:40.3463803Z", - "execution_start_time": "2025-01-11T14:50:31.2930721Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "ec3dba3e-335f-4a2b-8f13-7f034153d961", - "queued_time": "2025-01-11T14:50:31.1556519Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 6, - "statement_ids": [ - 6 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 6, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 09:48:44 WARN TaskSetManager: Stage 2 contains a task of very large size (12201 KiB). 
The maximum recommended task size is 1000 KiB.\n", + " \r" + ] }, { "data": { @@ -276,7 +212,7 @@ "6797557" ] }, - "execution_count": 13, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +236,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -312,44 +248,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:51:48.9139321Z", - "execution_start_time": "2025-01-11T14:51:42.0394455Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "5886e1ba-297a-42b5-8d17-7d141e9a261f", - "queued_time": "2025-01-11T14:51:41.8997493Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 9, - "statement_ids": [ - 9 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 9, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "168114" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "node_df = (twitch_df\n", " .select('numeric_id_1')\n", @@ -364,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -376,34 +275,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:52:25.8382593Z", - "execution_start_time": "2025-01-11T14:52:15.2936378Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "00844db7-d433-41f6-bb3a-6d30eed6a6d0", - "queued_time": "2025-01-11T14:52:15.1638534Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - 
"statement_id": 11, - "statement_ids": [ - 11 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 11, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " node_df.write\n", @@ -431,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": { "jupyter": { "outputs_hidden": false, @@ -445,30 +317,13 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:55:23.0498345Z", - "execution_start_time": "2025-01-11T14:55:00.1139093Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "93db93ce-ba8c-4141-9062-ba6a9c681008", - "queued_time": "2025-01-11T14:55:00.0018896Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 14, - "statement_ids": [ - 14 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 14, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 09:48:52 WARN TaskSetManager: Stage 5 contains a task of very large size (12201 KiB). The maximum recommended task size is 1000 KiB.\n", + "25/01/12 09:49:23 WARN TaskSetManager: Stage 13 contains a task of very large size (12201 KiB). 
The maximum recommended task size is 1000 KiB.\n", + " \r" + ] } ], "source": [ @@ -480,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": { "jupyter": { "outputs_hidden": false, @@ -494,30 +349,20 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T14:56:52.1245437Z", - "execution_start_time": "2025-01-11T14:56:48.2089393Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "6afd2da0-a917-4ac5-b24e-c0f609582ca7", - "queued_time": "2025-01-11T14:56:48.0701321Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 15, - "statement_ids": [ - 15 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 15, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 09:49:44 WARN TaskSetManager: Stage 29 contains a task of very large size (12201 KiB). 
The maximum recommended task size is 1000 KiB.\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 5.0 in stage 29 (TID 87): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 29 (TID 82): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 1.0 in stage 29 (TID 83): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 6.0 in stage 29 (TID 88): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 2.0 in stage 29 (TID 84): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 29 (TID 85): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 7.0 in stage 29 (TID 89): Attempting to kill Python Worker\n", + "25/01/12 09:49:48 WARN PythonRunner: Detected deadlock while completing task 4.0 in stage 29 (TID 86): Attempting to kill Python Worker\n", + " \r" + ] }, { "name": "stdout", @@ -526,26 +371,26 @@ "+------------+------------+------+-----+\n", "|numeric_id_1|numeric_id_2| group|batch|\n", "+------------+------------+------+-----+\n", - "| 111207| 159752|1 -- 5| 3|\n", - "| 111207| 145773|1 -- 5| 3|\n", - "| 111207| 42059|1 -- 6| 8|\n", - "| 111207| 29852|1 -- 7| 4|\n", - "| 111207| 75443|1 -- 1| 1|\n", - "| 111207| 12216|1 -- 2| 6|\n", - "| 111207| 123168|0 -- 1| 5|\n", - "| 111207| 40798|1 -- 1| 1|\n", - "| 111207| 52070|0 -- 1| 5|\n", - "| 111207| 118432|1 -- 3| 2|\n", - "| 111207| 140508|1 -- 2| 6|\n", - "| 111207| 78550|1 -- 8| 0|\n", - "| 111207| 76995|1 -- 2| 6|\n", - "| 111207| 73856|1 -- 3| 2|\n", - "| 111207| 154308|1 -- 2| 6|\n", - "| 111207| 129693|0 -- 1| 5|\n", - "| 111207| 63938|1 -- 1| 1|\n", - "| 111207| 21381|1 -- 
4| 7|\n", - "| 111207| 28488|1 -- 2| 6|\n", - "| 111207| 13564|1 -- 1| 1|\n", + "| 132936| 55282|5 -- 7| 6|\n", + "| 132936| 52228|1 -- 7| 4|\n", + "| 132936| 87835|6 -- 7| 2|\n", + "| 132936| 91750|7 -- 8| 3|\n", + "| 132936| 23194|3 -- 7| 5|\n", + "| 132936| 76703|2 -- 7| 0|\n", + "| 132936| 7938|7 -- 8| 3|\n", + "| 132936| 150412|7 -- 7| 7|\n", + "| 132936| 2158|5 -- 7| 6|\n", + "| 132936| 90372|7 -- 8| 3|\n", + "| 132936| 165616|4 -- 7| 1|\n", + "| 132936| 108001|7 -- 8| 3|\n", + "| 132936| 55253|7 -- 7| 7|\n", + "| 132936| 156774|7 -- 7| 7|\n", + "| 132936| 155380|2 -- 7| 0|\n", + "| 132936| 109338|6 -- 7| 2|\n", + "| 132936| 65772|6 -- 7| 2|\n", + "| 132936| 161490|0 -- 7| 8|\n", + "| 132936| 93083|4 -- 7| 1|\n", + "| 132936| 53374|7 -- 8| 3|\n", "+------------+------------+------+-----+\n", "only showing top 20 rows\n", "\n" @@ -558,7 +403,45 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 09:49:51 WARN TaskSetManager: Stage 37 contains a task of very large size (12201 KiB). The maximum recommended task size is 1000 KiB.\n", + " \r" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "create_ingest_heatmap(rel_batch_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -570,34 +453,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T15:01:54.9500155Z", - "execution_start_time": "2025-01-11T14:57:24.5030447Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "ecdaef15-dc48-48f2-b23b-b53daa35827f", - "queued_time": "2025-01-11T14:57:24.3876952Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 16, - "statement_ids": [ - 16 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 16, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "query = \"\"\"\n", " MATCH(source:Node {id: event.numeric_id_1})\n", @@ -627,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -639,41 +495,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T15:03:46.0902216Z", - "execution_start_time": "2025-01-11T15:03:43.2960105Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "f7cb9b0e-b957-4dae-93d6-0c93f3846fe5", - "queued_time": "2025-01-11T15:03:43.1740342Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 20, - "statement_ids": [ - 20 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 20, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "6797557 680\n" - ] - } - ], + "outputs": [], "source": [ "rel_count = twitch_df.count()\n", "batch_count = rel_count // 10000 + 1\n", @@ -682,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -694,41 +516,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T15:09:07.318409Z", - "execution_start_time": "2025-01-11T15:09:06.8032688Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "be93cdfa-06c6-4d6e-bc5a-0146f77d1cd1", - "queued_time": "2025-01-11T15:09:06.6770525Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 25, - "statement_ids": [ - 25 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 25, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "680\n" - ] - } - ], + "outputs": [], "source": [ "from pyspark.sql.functions import lit\n", "del_df = (spark.range(batch_count)\n", @@ -738,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -750,34 +538,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T15:16:01.4072209Z", - "execution_start_time": "2025-01-11T15:10:19.9712186Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "457a965b-b574-4751-85c2-dc2b43e2c3a2", - "queued_time": "2025-01-11T15:10:19.851916Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 27, - "statement_ids": [ - 27 - ] - }, - "text/plain": [ - 
"StatementMeta(medium, 18, 27, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "del_query = \"MATCH ()-[r:RELATES_TO]->() WITH r LIMIT 10000 DELETE r\"\n", "\n", @@ -806,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, @@ -818,34 +579,7 @@ } } }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T15:32:11.4027857Z", - "execution_start_time": "2025-01-11T15:22:59.3132605Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "d14c3ab9-8c1b-4d65-a66e-ba784a7c620e", - "queued_time": "2025-01-11T15:22:59.1886902Z", - "session_id": "18", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 28, - "statement_ids": [ - 28 - ] - }, - "text/plain": [ - "StatementMeta(medium, 18, 28, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " twitch_df.coalesce(1).write\n", @@ -860,11 +594,21 @@ "metadata": { "description": null, "kernelspec": { - "display_name": "python", - "name": "synapse_pyspark" + "display_name": ".venv", + "language": "python", + "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" }, "save_output": true, "synapse_widget": { diff --git a/examples/predefined_components_reddit_threads.ipynb b/examples/predefined_components_reddit_threads.ipynb index a5b0d63..bbb4b49 100644 --- a/examples/predefined_components_reddit_threads.ipynb +++ b/examples/predefined_components_reddit_threads.ipynb @@ -2,36 +2,9 @@ 
"cells": [ { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T22:00:42.4235235Z", - "execution_start_time": "2025-01-11T22:00:42.2764302Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "04a8f60b-046e-4d2e-956f-a81a40a47cf5", - "queued_time": "2025-01-11T22:00:42.1460445Z", - "session_id": "26", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 9, - "statement_ids": [ - 9 - ] - }, - "text/plain": [ - "StatementMeta(medium, 26, 9, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import StructType, StructField, IntegerType\n", @@ -40,6 +13,7 @@ "from zipfile import ZipFile\n", "from neo4j_parallel_spark_loader.predefined_components import group_and_batch_spark_dataframe\n", "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", + "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap\n", "import json" ] }, @@ -59,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": { "jupyter": { "outputs_hidden": false, @@ -73,30 +47,20 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T21:44:18.5177192Z", - "execution_start_time": "2025-01-11T21:44:18.3725338Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "da70e421-13d6-4887-bbb3-d39f40e87b24", - "queued_time": "2025-01-11T21:44:18.2491489Z", - "session_id": "26", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 3, - "statement_ids": [ - 3 - ] - }, - "text/plain": [ 
- "StatementMeta(medium, 26, 3, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Ignoring non-Spark config property: neo4j.url\n", + "Warning: Ignoring non-Spark config property: url\n", + "Warning: Ignoring non-Spark config property: neo4j.database\n", + "Warning: Ignoring non-Spark config property: neo4j.authentication.basic.password\n", + "Warning: Ignoring non-Spark config property: neo4j.authentication.basic.username\n", + "25/01/12 16:04:43 WARN Utils: Your hostname, Nathans-MacBook-Pro-3.local resolves to a loopback address: 127.0.0.1; using 192.168.86.181 instead (on interface en0)\n", + "25/01/12 16:04:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "25/01/12 16:04:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" + ] } ], "source": [ @@ -133,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 3, "metadata": { "jupyter": { "outputs_hidden": false, @@ -147,30 +111,13 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T23:01:14.2805724Z", - "execution_start_time": "2025-01-11T23:00:36.4081869Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "35d72265-fbd4-4fa3-91c6-238709bf8ad0", - "queued_time": "2025-01-11T23:00:35.3496326Z", - "session_id": "26", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 45, - "statement_ids": [ - 45 - ] - }, - "text/plain": [ - "StatementMeta(medium, 26, 45, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 16:05:54 WARN TaskSetManager: Stage 0 contains a task of very large size (5590 KiB). 
The maximum recommended task size is 1000 KiB.\n", + "25/01/12 16:06:00 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker\n", + " \r" + ] }, { "name": "stdout", @@ -231,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 4, "metadata": { "jupyter": { "outputs_hidden": false, @@ -245,30 +192,12 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T23:01:16.2592288Z", - "execution_start_time": "2025-01-11T23:01:14.4142013Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "0eb527b1-e56e-4a60-8fe6-9e4419af89bd", - "queued_time": "2025-01-11T23:00:43.7342871Z", - "session_id": "26", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 46, - "statement_ids": [ - 46 - ] - }, - "text/plain": [ - "StatementMeta(medium, 26, 46, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 16:06:02 WARN TaskSetManager: Stage 1 contains a task of very large size (5590 KiB). 
The maximum recommended task size is 1000 KiB.\n", + " \r" + ] }, { "data": { @@ -276,7 +205,7 @@ "5074915" ] }, - "execution_count": 93, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -431,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 5, "metadata": { "jupyter": { "outputs_hidden": false, @@ -445,30 +374,12 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T23:26:40.0326641Z", - "execution_start_time": "2025-01-11T23:26:29.4493665Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "e541da32-90b4-479c-a59b-4f959cca62bb", - "queued_time": "2025-01-11T23:25:49.4755611Z", - "session_id": "26", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 68, - "statement_ids": [ - 68 - ] - }, - "text/plain": [ - "StatementMeta(medium, 26, 68, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 16:06:08 WARN TaskSetManager: Stage 4 contains a task of very large size (5590 KiB). 
The maximum recommended task size is 1000 KiB.\n", + " \r" + ] } ], "source": [ @@ -479,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 6, "metadata": { "jupyter": { "outputs_hidden": false, @@ -493,30 +404,20 @@ }, "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2025-01-11T23:26:41.9804998Z", - "execution_start_time": "2025-01-11T23:26:40.1656981Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "969c7e39-5cf1-4c68-b3f8-733a3da7f95c", - "queued_time": "2025-01-11T23:25:53.0269186Z", - "session_id": "26", - "session_start_time": null, - "spark_jobs": null, - "spark_pool": "medium", - "state": "finished", - "statement_id": 69, - "statement_ids": [ - 69 - ] - }, - "text/plain": [ - "StatementMeta(medium, 26, 69, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 16:06:28 WARN TaskSetManager: Stage 12 contains a task of very large size (5590 KiB). 
The maximum recommended task size is 1000 KiB.\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 2.0 in stage 12 (TID 25): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 12 (TID 23): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 1.0 in stage 12 (TID 24): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 4.0 in stage 12 (TID 27): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 12 (TID 26): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 7.0 in stage 12 (TID 30): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 6.0 in stage 12 (TID 29): Attempting to kill Python Worker\n", + "25/01/12 16:06:32 WARN PythonRunner: Detected deadlock while completing task 5.0 in stage 12 (TID 28): Attempting to kill Python Worker\n", + "[Stage 13:====================================> (5 + 3) / 8]\r" + ] }, { "name": "stdout", @@ -525,36 +426,81 @@ "+--------+---------+---------+-----+-----+\n", "|graph_id|source_id|target_id|group|batch|\n", "+--------+---------+---------+-----+-----+\n", - "| 10223| 0| 2| 3| 0|\n", - "| 10223| 1| 2| 3| 0|\n", - "| 10223| 2| 3| 3| 0|\n", - "| 10222| 7| 23| 2| 0|\n", - "| 10222| 8| 16| 2| 0|\n", - "| 10222| 9| 16| 2| 0|\n", - "| 10222| 10| 16| 2| 0|\n", - "| 10222| 11| 16| 2| 0|\n", - "| 10222| 13| 16| 2| 0|\n", - "| 10222| 14| 16| 2| 0|\n", - "| 10222| 15| 16| 2| 0|\n", - "| 10222| 16| 17| 2| 0|\n", - "| 10222| 16| 18| 2| 0|\n", - "| 10222| 16| 19| 2| 0|\n", - "| 10222| 16| 20| 2| 0|\n", - "| 10222| 16| 21| 2| 0|\n", - "| 10222| 16| 22| 2| 0|\n", - "| 10222| 16| 23| 2| 0|\n", - "| 10222| 
16| 24| 2| 0|\n", - "| 10222| 16| 25| 2| 0|\n", + "| 0| 0| 2| 0| 0|\n", + "| 0| 1| 5| 0| 0|\n", + "| 0| 2| 4| 0| 0|\n", + "| 0| 2| 5| 0| 0|\n", + "| 0| 2| 6| 0| 0|\n", + "| 0| 2| 7| 0| 0|\n", + "| 0| 2| 8| 0| 0|\n", + "| 0| 2| 9| 0| 0|\n", + "| 0| 2| 10| 0| 0|\n", + "| 0| 3| 8| 0| 0|\n", + "| 1| 0| 3| 3| 0|\n", + "| 1| 0| 6| 3| 0|\n", + "| 1| 1| 8| 3| 0|\n", + "| 1| 2| 8| 3| 0|\n", + "| 1| 4| 8| 3| 0|\n", + "| 1| 5| 8| 3| 0|\n", + "| 1| 6| 8| 3| 0|\n", + "| 1| 7| 8| 3| 0|\n", + "| 1| 8| 9| 3| 0|\n", + "| 1| 8| 10| 3| 0|\n", "+--------+---------+---------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] } ], "source": [ "rel_batch_df.show()" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/12 16:07:41 WARN TaskSetManager: Stage 35 contains a task of very large size (5590 KiB). The maximum recommended task size is 1000 KiB.\n", + " \r" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "create_ingest_heatmap(rel_batch_df, figsize=(8, 2))" + ] + }, { "cell_type": "code", "execution_count": 71, @@ -865,12 +811,21 @@ "name": "synapse_pyspark" }, "kernelspec": { - "display_name": "Synapse PySpark", - "language": "Python", - "name": "synapse_pyspark" + "display_name": ".venv", + "language": "python", + "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" }, "save_output": true, "synapse_widget": { From 167fbd82712503b696ca561246ab8b56e87f33c3 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 13 Jan 2025 12:37:38 -0600 Subject: [PATCH 10/13] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 947febb..d8e283c 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,4 @@ cython_debug/ # PyPI configuration file .pypirc +Neo4j-0f015ca4-Created-2025-01-09.txt From 6018cbf89207391f548c161285ba9d9a0169ed6e Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 13 Jan 2025 13:19:57 -0600 Subject: [PATCH 11/13] add requestst to dev, make figsize optional in viz --- .../visualize/heatmap.py | 8 +- poetry.lock | 261 ++++++++++++++---- pyproject.toml | 3 +- 3 files changed, 209 insertions(+), 63 deletions(-) diff --git a/neo4j_parallel_spark_loader/visualize/heatmap.py b/neo4j_parallel_spark_loader/visualize/heatmap.py index 321b895..5fd32a8 100644 --- a/neo4j_parallel_spark_loader/visualize/heatmap.py +++ b/neo4j_parallel_spark_loader/visualize/heatmap.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple import numpy as np import seaborn as sns from matplotlib.axes import Axes @@ -25,7 +25,7 @@ def _format_spark_dataframe_for_visualization( 
return [row.asDict() for row in counts_sdf.collect()] def create_ingest_heatmap( - spark_dataframe: DataFrame, title: str = "Parallel Ingest Heat Map", figsize=(8, 12) + spark_dataframe: DataFrame, title: str = "Parallel Ingest Heat Map", figsize: Optional[Tuple[float, float]] = None ) -> Axes: """ Create the ingest heatmap from a list of dictionaries. @@ -38,7 +38,7 @@ def create_ingest_heatmap( title : str, optional A title for the visualization, by default "Parallel Ingest Heat Map" figsize : tuple, optional - Figure size (width, height) in inches, by default (8, 12) + Figure size (width, height) in inches, by default None Returns ------- @@ -97,7 +97,7 @@ def create_ingest_heatmap( yticklabels=y_values, linewidths=0.5, ) - + # plt.clim(0) ax.set_xlabel("Group Number") ax.set_ylabel("Batch") ax.set_title(title) diff --git a/poetry.lock b/poetry.lock index c93acf9..85d84e1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,117 @@ # This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +[[package]] +name = "certifi" +version = "2024.12.14" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"}, + {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.1" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win32.whl", 
hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6"}, + {file = 
"charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd"}, + {file = 
"charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd"}, + 
{file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a"}, + 
{file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4"}, + {file = 
"charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765"}, + {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, + {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, +] + [[package]] name = "colorama" version = "0.4.6" @@ -260,6 +372,20 @@ ufo = ["fs (>=2.2.0,<3)"] unicode = ["unicodedata2 (>=15.1.0)"] woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = 
"sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -438,66 +564,47 @@ pyarrow = ["pyarrow (>=1.0.0)"] [[package]] name = "numpy" -version = "2.2.1" +version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false -python-versions = ">=3.10" +python-versions = ">=3.9" files = [ - {file = "numpy-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5edb4e4caf751c1518e6a26a83501fda79bff41cc59dac48d70e6d65d4ec4440"}, - {file = "numpy-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa3017c40d513ccac9621a2364f939d39e550c542eb2a894b4c8da92b38896ab"}, - {file = "numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:61048b4a49b1c93fe13426e04e04fdf5a03f456616f6e98c7576144677598675"}, - {file = "numpy-2.2.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:7671dc19c7019103ca44e8d94917eba8534c76133523ca8406822efdd19c9308"}, - {file = "numpy-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4250888bcb96617e00bfa28ac24850a83c9f3a16db471eca2ee1f1714df0f957"}, - {file = "numpy-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7746f235c47abc72b102d3bce9977714c2444bdfaea7888d241b4c4bb6a78bf"}, - {file = "numpy-2.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:059e6a747ae84fce488c3ee397cee7e5f905fd1bda5fb18c66bc41807ff119b2"}, - {file = "numpy-2.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f62aa6ee4eb43b024b0e5a01cf65a0bb078ef8c395e8713c6e8a12a697144528"}, - {file = "numpy-2.2.1-cp310-cp310-win32.whl", hash = "sha256:48fd472630715e1c1c89bf1feab55c29098cb403cc184b4859f9c86d4fcb6a95"}, - {file = "numpy-2.2.1-cp310-cp310-win_amd64.whl", hash 
= "sha256:b541032178a718c165a49638d28272b771053f628382d5e9d1c93df23ff58dbf"}, - {file = "numpy-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40f9e544c1c56ba8f1cf7686a8c9b5bb249e665d40d626a23899ba6d5d9e1484"}, - {file = "numpy-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9b57eaa3b0cd8db52049ed0330747b0364e899e8a606a624813452b8203d5f7"}, - {file = "numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bc8a37ad5b22c08e2dbd27df2b3ef7e5c0864235805b1e718a235bcb200cf1cb"}, - {file = "numpy-2.2.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9036d6365d13b6cbe8f27a0eaf73ddcc070cae584e5ff94bb45e3e9d729feab5"}, - {file = "numpy-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51faf345324db860b515d3f364eaa93d0e0551a88d6218a7d61286554d190d73"}, - {file = "numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38efc1e56b73cc9b182fe55e56e63b044dd26a72128fd2fbd502f75555d92591"}, - {file = "numpy-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:31b89fa67a8042e96715c68e071a1200c4e172f93b0fbe01a14c0ff3ff820fc8"}, - {file = "numpy-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c86e2a209199ead7ee0af65e1d9992d1dce7e1f63c4b9a616500f93820658d0"}, - {file = "numpy-2.2.1-cp311-cp311-win32.whl", hash = "sha256:b34d87e8a3090ea626003f87f9392b3929a7bbf4104a05b6667348b6bd4bf1cd"}, - {file = "numpy-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:360137f8fb1b753c5cde3ac388597ad680eccbbbb3865ab65efea062c4a1fd16"}, - {file = "numpy-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:694f9e921a0c8f252980e85bce61ebbd07ed2b7d4fa72d0e4246f2f8aa6642ab"}, - {file = "numpy-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3683a8d166f2692664262fd4900f207791d005fb088d7fdb973cc8d663626faa"}, - {file = "numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:780077d95eafc2ccc3ced969db22377b3864e5b9a0ea5eb347cc93b3ea900315"}, - {file = 
"numpy-2.2.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:55ba24ebe208344aa7a00e4482f65742969a039c2acfcb910bc6fcd776eb4355"}, - {file = "numpy-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1d07b53b78bf84a96898c1bc139ad7f10fda7423f5fd158fd0f47ec5e01ac7"}, - {file = "numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5062dc1a4e32a10dc2b8b13cedd58988261416e811c1dc4dbdea4f57eea61b0d"}, - {file = "numpy-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fce4f615f8ca31b2e61aa0eb5865a21e14f5629515c9151850aa936c02a1ee51"}, - {file = "numpy-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:67d4cda6fa6ffa073b08c8372aa5fa767ceb10c9a0587c707505a6d426f4e046"}, - {file = "numpy-2.2.1-cp312-cp312-win32.whl", hash = "sha256:32cb94448be47c500d2c7a95f93e2f21a01f1fd05dd2beea1ccd049bb6001cd2"}, - {file = "numpy-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ba5511d8f31c033a5fcbda22dd5c813630af98c70b2661f2d2c654ae3cdfcfc8"}, - {file = "numpy-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1d09e520217618e76396377c81fba6f290d5f926f50c35f3a5f72b01a0da780"}, - {file = "numpy-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3ecc47cd7f6ea0336042be87d9e7da378e5c7e9b3c8ad0f7c966f714fc10d821"}, - {file = "numpy-2.2.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f419290bc8968a46c4933158c91a0012b7a99bb2e465d5ef5293879742f8797e"}, - {file = "numpy-2.2.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5b6c390bfaef8c45a260554888966618328d30e72173697e5cabe6b285fb2348"}, - {file = "numpy-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:526fc406ab991a340744aad7e25251dd47a6720a685fa3331e5c59fef5282a59"}, - {file = "numpy-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f74e6fdeb9a265624ec3a3918430205dff1df7e95a230779746a6af78bc615af"}, - {file = "numpy-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash 
= "sha256:53c09385ff0b72ba79d8715683c1168c12e0b6e84fb0372e97553d1ea91efe51"}, - {file = "numpy-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f3eac17d9ec51be534685ba877b6ab5edc3ab7ec95c8f163e5d7b39859524716"}, - {file = "numpy-2.2.1-cp313-cp313-win32.whl", hash = "sha256:9ad014faa93dbb52c80d8f4d3dcf855865c876c9660cb9bd7553843dd03a4b1e"}, - {file = "numpy-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:164a829b6aacf79ca47ba4814b130c4020b202522a93d7bff2202bfb33b61c60"}, - {file = "numpy-2.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4dfda918a13cc4f81e9118dea249e192ab167a0bb1966272d5503e39234d694e"}, - {file = "numpy-2.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:733585f9f4b62e9b3528dd1070ec4f52b8acf64215b60a845fa13ebd73cd0712"}, - {file = "numpy-2.2.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:89b16a18e7bba224ce5114db863e7029803c179979e1af6ad6a6b11f70545008"}, - {file = "numpy-2.2.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:676f4eebf6b2d430300f1f4f4c2461685f8269f94c89698d832cdf9277f30b84"}, - {file = "numpy-2.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f5cdf9f493b35f7e41e8368e7d7b4bbafaf9660cba53fb21d2cd174ec09631"}, - {file = "numpy-2.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1ad395cf254c4fbb5b2132fee391f361a6e8c1adbd28f2cd8e79308a615fe9d"}, - {file = "numpy-2.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:08ef779aed40dbc52729d6ffe7dd51df85796a702afbf68a4f4e41fafdc8bda5"}, - {file = "numpy-2.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:26c9c4382b19fcfbbed3238a14abf7ff223890ea1936b8890f058e7ba35e8d71"}, - {file = "numpy-2.2.1-cp313-cp313t-win32.whl", hash = "sha256:93cf4e045bae74c90ca833cba583c14b62cb4ba2cba0abd2b141ab52548247e2"}, - {file = "numpy-2.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bff7d8ec20f5f42607599f9994770fa65d76edca264a87b5e4ea5629bce12268"}, - {file = 
"numpy-2.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7ba9cc93a91d86365a5d270dee221fdc04fb68d7478e6bf6af650de78a8339e3"}, - {file = "numpy-2.2.1-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:3d03883435a19794e41f147612a77a8f56d4e52822337844fff3d4040a142964"}, - {file = "numpy-2.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4511d9e6071452b944207c8ce46ad2f897307910b402ea5fa975da32e0102800"}, - {file = "numpy-2.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5c5cc0cbabe9452038ed984d05ac87910f89370b9242371bd9079cb4af61811e"}, - {file = "numpy-2.2.1.tar.gz", hash = "sha256:45681fd7128c8ad1c379f0ca0776a8b0c6583d2f69889ddac01559dfe4390918"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = 
"numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] [[package]] @@ -823,6 +930,27 @@ files = [ {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, ] +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + [[package]] name = "ruff" version = "0.3.7" @@ -933,7 +1061,24 @@ files = [ {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, ] +[[package]] +name = "urllib3" +version = "2.3.0" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, + {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9ddd24c13f7089658add40758983e01316712c5ac1997fff8eab1acb440ec18c" +content-hash = "3c58cf2642e706f06b592d49d46af80286f2752343362093a58def6b6cb8f416" diff --git a/pyproject.toml b/pyproject.toml index 7c01ba5..2f680fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ exclude = ["tests/*", "benchmarking/*", "docs/*", "examples/*", "Makefile"] [tool.poetry.dependencies] python = "^3.10" pyspark = "^3.4.0" -numpy = "^2.2.1" +numpy = "^1.2.0" [tool.poetry.group.dev.dependencies] @@ -21,6 +21,7 @@ pytest-mock = "^3.12.0" python-dotenv = "^1.0.1" ruff = "^0.3.0" seaborn = "^0.13.2" +requests = "^2.32.3" [tool.ruff] target-version = "py312" From 7b5b9604f2e0736a9b03a1bf6a732ac98009c4c1 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 13 Jan 2025 13:31:26 -0600 Subject: [PATCH 12/13] update examples with dotenv --- examples/bipartite_amazon_ratings.ipynb | 22 +++++------ examples/heatmap_example.ipynb | 4 +- examples/monopartite_twitch_gamers.ipynb | 21 +++++----- ...predefined_components_reddit_threads.ipynb | 39 ++++++++++++------- .../monopartite/grouping.py | 29 +++++++------- .../visualize/heatmap.py | 39 ++++++++++++------- 6 files changed, 90 insertions(+), 64 deletions(-) diff --git a/examples/bipartite_amazon_ratings.ipynb b/examples/bipartite_amazon_ratings.ipynb index 87511c1..dc844d8 100644 --- a/examples/bipartite_amazon_ratings.ipynb +++ b/examples/bipartite_amazon_ratings.ipynb @@ -6,6 +6,8 @@ 
"metadata": {}, "outputs": [], "source": [ + "import os\n", + "from dotenv import load_dotenv\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import StructType, StructField, IntegerType, FloatType\n", "import requests\n", @@ -13,7 +15,9 @@ "from zipfile import ZipFile\n", "from neo4j_parallel_spark_loader.bipartite import group_and_batch_spark_dataframe\n", "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", - "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap\n" + "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap\n", + "\n", + "load_dotenv()\n" ] }, { @@ -63,20 +67,16 @@ } ], "source": [ - "username = \"NEO4J_USER\"\n", - "password = \"NEO4J_PASSWORD\"\n", - "url = \"NEO4J_URL\"\n", - "dbname = \"NEO4J_DATABASE\"\n", "spark_executor_count=5\n", "\n", "spark = (\n", " SparkSession.builder\n", " .appName(\"AmazonRatings\")\n", - " .config(\"neo4j.url\", url)\n", - " .config(\"url\", url)\n", - " .config(\"neo4j.authentication.basic.username\", username)\n", - " .config(\"neo4j.authentication.basic.password\", password)\n", - " .config(\"neo4j.database\", dbname)\n", + " .config(\"neo4j.url\", os.environ.get(\"NEO4J_URI\"))\n", + " .config(\"url\", os.environ.get(\"NEO4J_URI\"))\n", + " .config(\"neo4j.authentication.basic.username\", os.environ.get(\"USERNAME\"))\n", + " .config(\"neo4j.authentication.basic.password\", os.environ.get(\"NEO4J_PASSWORD\"))\n", + " .config(\"neo4j.database\", os.environ.get(\"NEO4J_DATABASE\"))\n", " .getOrCreate()\n", ")" ] @@ -662,7 +662,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.3" }, "save_output": true, "synapse_widget": { diff --git a/examples/heatmap_example.ipynb b/examples/heatmap_example.ipynb index f3fc7c4..45f585e 100644 --- a/examples/heatmap_example.ipynb +++ b/examples/heatmap_example.ipynb @@ -72,7 +72,7 @@ "text": [ "Setting default log level to 
\"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "25/01/06 09:16:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + "25/01/13 13:23:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" ] } ], @@ -125,7 +125,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] diff --git a/examples/monopartite_twitch_gamers.ipynb b/examples/monopartite_twitch_gamers.ipynb index 0bf13af..2956c5c 100644 --- a/examples/monopartite_twitch_gamers.ipynb +++ b/examples/monopartite_twitch_gamers.ipynb @@ -6,13 +6,18 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", "from pyspark.sql import SparkSession\n", "import requests\n", "from io import BytesIO\n", "from zipfile import ZipFile\n", "from neo4j_parallel_spark_loader.monopartite import group_and_batch_spark_dataframe\n", "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", - "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap" + "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap\n", + "\n", + "load_dotenv()" ] }, { @@ -60,20 +65,16 @@ } ], "source": [ - "username = \"NEO4J_USER\"\n", - "password = \"NEO4J_PASSWORD\"\n", - "url = \"NEO4J_URL\"\n", - "dbname = \"NEO4J_DATABASE\"\n", "spark_executor_count=5\n", "\n", "spark = (\n", " SparkSession.builder\n", " .appName(\"TwitchGamers\")\n", - " .config(\"neo4j.url\", url)\n", - " .config(\"url\", url)\n", - " .config(\"neo4j.authentication.basic.username\", username)\n", - " .config(\"neo4j.authentication.basic.password\", password)\n", - " .config(\"neo4j.database\", dbname)\n", + " .config(\"neo4j.url\", os.environ.get(\"NEO4J_URI\"))\n", + " .config(\"url\", os.environ.get(\"NEO4J_URI\"))\n", + " .config(\"neo4j.authentication.basic.username\", os.environ.get(\"USERNAME\"))\n", + " .config(\"neo4j.authentication.basic.password\", os.environ.get(\"NEO4J_PASSWORD\"))\n", + " .config(\"neo4j.database\", os.environ.get(\"NEO4J_DATABASE\"))\n", " .getOrCreate()\n", ")" ] diff --git a/examples/predefined_components_reddit_threads.ipynb b/examples/predefined_components_reddit_threads.ipynb index bbb4b49..c272dfc 100644 --- a/examples/predefined_components_reddit_threads.ipynb +++ 
b/examples/predefined_components_reddit_threads.ipynb @@ -2,19 +2,36 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import StructType, StructField, IntegerType\n", + "\n", "import requests\n", "from io import BytesIO\n", "from zipfile import ZipFile\n", "from neo4j_parallel_spark_loader.predefined_components import group_and_batch_spark_dataframe\n", "from neo4j_parallel_spark_loader import ingest_spark_dataframe\n", "from neo4j_parallel_spark_loader.visualize import create_ingest_heatmap\n", - "import json" + "import json\n", + "\n", + "load_dotenv()" ] }, { @@ -64,20 +81,16 @@ } ], "source": [ - "username = \"NEO4J_USER\"\n", - "password = \"NEO4J_PASSWORD\"\n", - "url = \"NEO4J_URL\"\n", - "dbname = \"NEO4J_DATABASE\"\n", "spark_executor_count=5\n", "\n", "spark = (\n", " SparkSession.builder\n", " .appName(\"ReditThreads\")\n", - " .config(\"neo4j.url\", url)\n", - " .config(\"url\", url)\n", - " .config(\"neo4j.authentication.basic.username\", username)\n", - " .config(\"neo4j.authentication.basic.password\", \"i2_dYwwAMKKqp7tokHoscvNJbBBn1snAZKX0uA_gffA\")\n", - " .config(\"neo4j.database\", dbname)\n", + " .config(\"neo4j.url\", os.environ.get(\"NEO4J_URI\"))\n", + " .config(\"url\", os.environ.get(\"NEO4J_URI\"))\n", + " .config(\"neo4j.authentication.basic.username\", os.environ.get(\"USERNAME\"))\n", + " .config(\"neo4j.authentication.basic.password\", os.environ.get(\"NEO4J_PASSWORD\"))\n", + " .config(\"neo4j.database\", os.environ.get(\"NEO4J_DATABASE\"))\n", " .getOrCreate()\n", ")" ] @@ -825,7 +838,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.11.7" + "version": "3.12.3" }, "save_output": true, "synapse_widget": { diff --git a/neo4j_parallel_spark_loader/monopartite/grouping.py b/neo4j_parallel_spark_loader/monopartite/grouping.py index 1051876..47c6b94 100644 --- a/neo4j_parallel_spark_loader/monopartite/grouping.py +++ b/neo4j_parallel_spark_loader/monopartite/grouping.py @@ -50,20 +50,21 @@ def create_node_groupings( grouping_column="combined_col", ) - final_sdf = (spark_dataframe - .join( - other=keys_sdf.withColumnRenamed("group", "source_group"), - on=(spark_dataframe[source_col] == keys_sdf.value), - how="left" - ) - .drop(keys_sdf.value) - .join( - other=keys_sdf.withColumnRenamed("group", "target_group"), - on=(spark_dataframe[target_col] == keys_sdf.value), - how="left" - ) - .drop(keys_sdf.value) - .drop("value")) + final_sdf = ( + spark_dataframe.join( + other=keys_sdf.withColumnRenamed("group", "source_group"), + on=(spark_dataframe[source_col] == keys_sdf.value), + how="left", + ) + .drop(keys_sdf.value) + .join( + other=keys_sdf.withColumnRenamed("group", "target_group"), + on=(spark_dataframe[target_col] == keys_sdf.value), + how="left", + ) + .drop(keys_sdf.value) + .drop("value") + ) final_sdf = final_sdf.withColumn( "group", diff --git a/neo4j_parallel_spark_loader/visualize/heatmap.py b/neo4j_parallel_spark_loader/visualize/heatmap.py index 5fd32a8..b3ec205 100644 --- a/neo4j_parallel_spark_loader/visualize/heatmap.py +++ b/neo4j_parallel_spark_loader/visualize/heatmap.py @@ -1,9 +1,11 @@ from typing import Any, Dict, List, Optional, Tuple + +import matplotlib.pyplot as plt import numpy as np import seaborn as sns from matplotlib.axes import Axes from pyspark.sql import DataFrame -import matplotlib.pyplot as plt + def _format_spark_dataframe_for_visualization( spark_dataframe: DataFrame, @@ -24,12 +26,16 @@ def _format_spark_dataframe_for_visualization( counts_sdf = spark_dataframe.groupBy("group", "batch").count() return [row.asDict() for row in counts_sdf.collect()] + 
def create_ingest_heatmap( - spark_dataframe: DataFrame, title: str = "Parallel Ingest Heat Map", figsize: Optional[Tuple[float, float]] = None + spark_dataframe: DataFrame, + title: str = "Parallel Ingest Heat Map", + figsize: Optional[Tuple[float, float]] = None, ) -> Axes: """ Create the ingest heatmap from a list of dictionaries. This heatmap will display batches on the y-axis and group numbers on the x-axis. + Group IDs will be displayed in parenthesis below the value count in each cell. Parameters ---------- @@ -62,12 +68,14 @@ def create_ingest_heatmap( # Transform data with group numbers transformed_data = [] for d in data: - transformed_data.append({ - "batch": d["batch"], - "group_num": batch_group_mappings[d["batch"]][d["group"]], - "count": d["count"], - "original_group": d["group"] - }) + transformed_data.append( + { + "batch": d["batch"], + "group_num": batch_group_mappings[d["batch"]][d["group"]], + "count": d["count"], + "original_group": d["group"], + } + ) # Extract unique x and y values y_values = sorted(set(d["batch"] for d in transformed_data), reverse=True) @@ -83,11 +91,13 @@ def create_ingest_heatmap( x_idx = x_values.index(item["group_num"]) heatmap_data[y_idx, x_idx] = item["count"] # Create annotation with count and original group name - annotation_labels[y_idx, x_idx] = f"{item['count']:,.0f}\n({item['original_group']})" + annotation_labels[y_idx, x_idx] = ( + f"{item['count']:,.0f}\n({item['original_group']})" + ) # Create figure with specified size plt.figure(figsize=figsize) - + # Create heatmap ax = sns.heatmap( data=heatmap_data, @@ -96,10 +106,11 @@ def create_ingest_heatmap( xticklabels=x_values, yticklabels=y_values, linewidths=0.5, + vmin=0, ) - # plt.clim(0) - ax.set_xlabel("Group Number") + + ax.set_xlabel("Spark Processor Node Number") ax.set_ylabel("Batch") ax.set_title(title) - - return ax \ No newline at end of file + + return ax From 5b8ea254779496f784d1d1be56d76e3f7633c904 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 
13 Jan 2025 13:42:51 -0600 Subject: [PATCH 13/13] Update CHANGELOG.md --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52cc112..ce1ad61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,16 @@ ### Changed +* Swap heatmap visualization axes +* Add group ID to each cell in heatmap +* Heatmap scale starts at 0 +* Update monopartite batching algorithm +* Update ingest algorithm + ### Added +* Examples demonstrating each parallel ingest method with real data + ## 0.2.4 ### Fixed