diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat-cli.py b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat-cli.py new file mode 100644 index 00000000..843002e3 --- /dev/null +++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat-cli.py @@ -0,0 +1,269 @@ +import base64 +import io +import json +import logging +import os +from datetime import datetime +import random +from PIL import Image + +import inquirer +from file_utils import save_base64_image + +from amazon_image_gen import BedrockImageGenerator +from ImageGenChat import ImageGenChat + +image_resolutions = { + "16:9": {"width": 1280, "height": 720}, + "1:1": {"width": 1024, "height": 1024}, + "9:16": {"width": 720, "height": 1280}, +} + + +class FileFilter(logging.Filter): + def filter(self, record): + # Only allow logs from test_enhance_prompt.py and ImageGenChat.py + return record.filename in ["test_enhance_prompt.py", "ImageGenChat.py"] + + +def configure_logging(): + # Configure the root logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # Create a handler with your desired format + handler = logging.StreamHandler() + handler.setFormatter( + logging.Formatter( + "%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s" + ) + ) + + # Add the filter to the handler + handler.addFilter(FileFilter()) + + # Add the handler to the logger + logger.addHandler(handler) + + # Clear any existing handlers from the root logger to avoid duplicate logs + for hdlr in logger.handlers[:]: + if isinstance(hdlr, logging.StreamHandler) and hdlr != handler: + logger.removeHandler(hdlr) + + return logger + + +def validate_int(answers, current): + try: + value = int(current) + if value < 0: + return "Please enter a positive number" + return True + except ValueError: + return "Please enter a valid number" + + +def get_starting_input(): + questions = [ + inquirer.List( + "model", + message="Select a LLM to use for enhancement", + choices=[ + "us.amazon.nova-pro-v1:0", + "us.amazon.nova-lite-v1:0", + "us.amazon.nova-micro-v1:0", + ], + ), + inquirer.Text( + "max_turns_to_track", + message="Enter the number of turns to remember", + validate=validate_int, + default="4", + ), + inquirer.List( + "enable_image_gen", message="Generate images?", choices=["No", "Yes"] + ), + ] + answers = inquirer.prompt(questions) + + # If the user chose to generation images, present them with a list of resolutions to choose from. + if answers["enable_image_gen"] == "Yes": + resolution_questions = [ + inquirer.List( + "resolution", + message="Select an image resolution", + choices=image_resolutions.keys(), + ), + inquirer.List( + "quality", + message="Select a quality setting", + choices=["standard", "premium"], + ), + ] + resolution_answers = inquirer.prompt(resolution_questions) + selected_resolution = image_resolutions[resolution_answers["resolution"]] + quality = resolution_answers["quality"] + + return ( + answers["model"], + int(answers["max_turns_to_track"]), + bool(answers["enable_image_gen"] == "Yes"), + ( + selected_resolution + if answers["enable_image_gen"] == "Yes" + else image_resolutions["16:9"] + ), + quality if answers["enable_image_gen"] == "Yes" else "No", + ) + + +def save_debugging_artifacts(image_chat, output_dir): + # Create the folder if it doesn't exist. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Save the system prompt that was used. 
+ system_prompt_path = os.path.join(output_dir, "system_prompt.txt") + with open(system_prompt_path, "w") as f: + f.write(image_chat.system_prompt) + + # Save the conversation message structure as a JSON file. + history_path = os.path.join(output_dir, "chat_history.json") + with open(history_path, "w") as f: + history_flattened = json.dumps(image_chat.chat_history, indent=2) + f.write(history_flattened) + + # Save the conversation as a more human-readable text file. + history_path = os.path.join(output_dir, "chat_history.md") + with open(history_path, "w") as f: + f.write(image_chat.get_chat_history_as_markdown()) + + +def display_chat_response(chat_response): + # At a minimum, there will always be a userIntent and a narrativeResponse. + user_intent = chat_response["userIntent"] + final_prompt = chat_response["finalPrompt"] + negative_prompt = chat_response["negativePrompt"] + narrative_response = chat_response["narrativeResponse"] + + next_action_options = chat_response.get("newIdeas", None) + + # Display the user intent. + print(f"\nUser Intent:\n{user_intent}") + + # Display enhanced prompt and negative prompt. + print(f"\nEnhanced Prompt:\n{final_prompt}") + print(f"\nNegative Prompt:\n{negative_prompt}") + + # Display the list of suggested prompts if they exist. + if next_action_options is not None: + print("\nSuggestions:") + for label in next_action_options: + print(f"- {label}") + + # Display the model's narrative response (which may be a follow up question) + print(f"\nNarrative Response:\n{narrative_response}") + + +def act_on_chat_response( + chat_response, enable_image_gen, width, height, quality, output_dir +): + # Generate and display an image if requested. + if enable_image_gen: + enhanced_prompt = chat_response["finalPrompt"] + negative_prompt = chat_response["negativePrompt"] + + # Create the generator. + generator = BedrockImageGenerator(output_directory=output_dir) + + # Configure the inference parameters. + seed = random.randint(0, 2147483646) + + inference_params = { + "taskType": "TEXT_IMAGE", + "textToImageParams": {"text": enhanced_prompt}, + "imageGenerationConfig": { + "numberOfImages": 1, + "width": width, + "height": height, + "quality": quality, + "cfgScale": 4.0, + "seed": seed, + }, + } + + # Add negative prompt if provided. + if negative_prompt: + inference_params["textToImageParams"]["negativeText"] = negative_prompt + + print("\nGenerating image...") + try: + # Generate the image(s). + response = generator.generate_images(inference_params) + + # Check for valid images. + images = response.get("images", []) + if len(images) == 0: + print("No images were generated.") + + # Check for an error message. + if response.get("error", None) is not None: + print(f"Error: {response['error']}") + return + + # Save the image as a PNG and display it. + image_base64 = images[0] + image_bytes = base64.b64decode(image_base64) + image = Image.open(io.BytesIO(image_bytes)) + image.show() + + except Exception as e: + print(f"Sorry. Image generation failed: {e}") + + +def main(): + # Configure logging. + logger = configure_logging() + + # Gather user preferences interactively. + print("") + selected_model, max_turns_to_track, enable_image_gen, resolution, quality = ( + get_starting_input() + ) + + print("\nEnter the prompt you would like to enhance. Enter 'q' to quit.\n") + + # Create an instance of ImageGenChat which manages conversation history and + # user input. 
+    image_chat = ImageGenChat(
+        turn_memory_count=max_turns_to_track,
+        model_id=selected_model,
+        region_name="us-east-1",
+    )
+
+    # Keep asking the user for input until they end the session.
+    while True:
+        # Have the user enter a prompt.
+        user_input = input("\nPrompt: ")
+        if user_input == "q":
+            break
+
+        # Create a folder name based on the current time.
+        folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        output_dir = os.path.join(os.path.dirname(__file__), "output", folder_name)
+
+        chat_response = image_chat.process_user_input(user_input, output_dir=output_dir)
+
+        save_debugging_artifacts(image_chat, output_dir)
+        display_chat_response(chat_response)
+        act_on_chat_response(
+            chat_response,
+            enable_image_gen,
+            resolution["width"],
+            resolution["height"],
+            quality,
+            output_dir,
+        )
+
+
+main()
diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat-test.py b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat-test.py
new file mode 100644
index 00000000..c6abf910
--- /dev/null
+++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat-test.py
@@ -0,0 +1,79 @@
+from ImageGenChat import ImageGenChat
+import time
+from datetime import datetime
+import os
+import json
+
+
+def measure_malformed_json_rate(
+    sample_size,
+    model_id,
+    turn_memory_count=4,
+    region_name="us-east-1",
+    max_requests_per_minute=20,
+):
+    """
+    Measure the rate of malformed JSON responses from the ImageGenChat class.
+
+    Args:
+        sample_size (int): The number of samples to test.
+        model_id (str): The model ID to use.
+        turn_memory_count (int): The number of turns to remember.
+        region_name (str): The AWS region name.
+        max_requests_per_minute (int): The maximum number of requests per minute.
+
+    Returns:
+        float: The percentage of malformed JSON responses.
+    """
+
+    print(f"Measuring malformed JSON rate with model {model_id}...")
+
+    # Create an instance of ImageGenChat which manages conversation history and
+    # user input.
+    image_chat = ImageGenChat(
+        turn_memory_count=turn_memory_count,
+        model_id=model_id,
+        region_name=region_name,
+    )
+
+    # Sample responses and count how many contain malformed JSON.
+    malformed_json_count = 0
+    last_invocation_time = time.time()
+
+    print("\nMeasuring malformed JSON rate...")
+
+    for index in range(sample_size):
+        print(f"Sample {index + 1} of {sample_size}")
+
+        # Create a folder name based on the current time.
+        folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        output_dir = os.path.join(os.path.dirname(__file__), "output", folder_name)
+
+        user_text = "Create a beach scene"
+        try:
+            image_chat.process_user_input(user_text=user_text, output_dir=output_dir)
+        except json.JSONDecodeError:
+            malformed_json_count += 1
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+
+        # Implement rate limiting so we stay under max_requests_per_minute.
+        current_time = time.time()
+        elapsed_time = current_time - last_invocation_time
+        if elapsed_time < 60 / max_requests_per_minute:
+            time.sleep(60 / max_requests_per_minute - elapsed_time)
+
+        # Record when this iteration finished so the next one is throttled correctly.
+        last_invocation_time = time.time()
+
+    # Calculate the percentage of malformed JSON responses.
+    malformed_json_rate = (malformed_json_count / sample_size) * 100
+    return malformed_json_rate
+
+
+def main():
+    malformed_json_rate = measure_malformed_json_rate(
+        sample_size=100, model_id="us.amazon.nova-lite-v1:0", max_requests_per_minute=20
+    )
+    print(f"Malformed JSON rate: {malformed_json_rate}%")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat.py b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat.py
new file mode 100644
index 00000000..9de67f65
--- /dev/null
+++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/ImageGenChat.py
@@ -0,0 +1,151 @@
+import json
+import logging
+import os
+
+import boto3
+
+# Create logger.
+logger = logging.getLogger(__name__)
+
+
+class ImageGenChat:
+    """
+    This class handles image generation conversation flow and provides
+    automatic image prompt enhancement. It maintains conversation history,
+    processes user inputs, and formats model responses.
+
+    Args:
+        model_id (str): The AWS Bedrock model identifier to use for inference.
+        turn_memory_count (int): Number of conversation turns to maintain in memory.
+        region_name (str): AWS region for the Bedrock service.
+    """
+
+    def __init__(self, model_id, turn_memory_count=4, region_name="us-east-1"):
+        self.model_id = model_id
+        self.turn_memory_count = turn_memory_count
+        self.region_name = region_name
+
+        self.system_prompt = ""
+
+        self.chat_history = []  # Array of user/assistant message objects
+        self.bedrock_runtime = boto3.client(
+            "bedrock-runtime", region_name=self.region_name
+        )
+
+    def process_user_input(self, user_text, output_dir=None):
+        # Load the system prompt from a file.
+        base_path = os.path.dirname(os.path.abspath(__file__))
+        system_prompt_path = os.path.join(base_path, "system_prompt.md")
+        with open(system_prompt_path, "r") as file:
+            self.system_prompt = file.read()
+
+        system = [{"text": self.system_prompt}]
+
+        # Append the user message to the conversation history.
+        self.chat_history.append({"role": "user", "content": [{"text": user_text}]})
+
+        # Temporarily append a partial assistant message to the chat history.
+        # This provides a way for us to put words in the model's mouth to help
+        # enforce the response style we want.
+        messages = [
+            *self.chat_history,
+            {
+                "role": "assistant",
+                "content": [{"text": "```json\n"}],
+            },
+        ]
+
+        # Configure the inference parameters.
+        inf_params = {
+            "maxTokens": 3000,
+            "temperature": 0.5,
+            "topP": 0.99,
+            "stopSequences": ["```"],
+        }
+
+        # Assemble the full request.
+        request_params = {
+            "modelId": self.model_id,
+            "messages": messages,
+            "system": system,
+            "inferenceConfig": inf_params,
+        }
+
+        # Save the request to a file for debugging.
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+            request_file_path = os.path.join(
+                output_dir, f"{self.model_id}-request.json"
+            )
+            with open(request_file_path, "w") as f:
+                f.write(json.dumps(request_params, indent=2))
+
+        # Invoke the model.
+        model_response = self.bedrock_runtime.converse(**request_params)
+
+        # Save the response to a file for debugging.
+        if output_dir:
+            response_file_path = os.path.join(
+                output_dir, f"{self.model_id}-response.json"
+            )
+            with open(response_file_path, "w") as f:
+                f.write(json.dumps(model_response, indent=2))
+
+        response_text = model_response["output"]["message"]["content"][0]["text"]
+
+        # Strip "```" from the end of response_text if present.
+        if response_text.endswith("```"):
+            response_text = response_text[:-3].strip()
+
+        try:
+            response_json = json.loads(response_text)
+
+            # Now that we've confirmed a valid JSON response was returned, we
+            # can add the full assistant response to the chat history
+            # permanently.
+            full_assistant_text = f"```json\n{response_text}\n```"
+            self.chat_history.append(
+                {"role": "assistant", "content": [{"text": full_assistant_text}]}
+            )
+
+        except json.JSONDecodeError:
+            logger.error(f"Error parsing JSON: {response_text}")
+            raise
+
+        # Trim from the start of the conversation history to bring it within the
+        # desired context memory bounds.
+        while len(self.chat_history) > self.turn_memory_count * 2:
+            self.chat_history.pop(0)
+
+        return response_json
+
+    def get_chat_history_as_markdown(self):
+        """Provides a simplified view of the chat history. Automatically pretty-prints JSON-only responses."""

+        # Loop through the chat history.
+        chat_history_text = ""
+        for message in self.chat_history:
+            role = message["role"]
+            content = message["content"][0]["text"]
+
+            # If the content is JSON, pretty print it.
+            try:
+                # Strip Markdown JSON fences if present.
+                if content.startswith("```json"):
+                    content = content[7:-3].strip()
+
+                # Only treat it as JSON if it starts with a brace.
+                if content.startswith("{"):
+                    content_json = json.loads(content)
+
+                    # Pretty print the JSON.
+                    content = json.dumps(content_json, indent=2)
+
+                    # Re-add JSON fences.
+                    content = f"```json\n{content}\n```"
+            except json.JSONDecodeError:
+                pass
+
+            chat_history_text += f"**{role.capitalize()}:**\n{content}\n\n"
+
+        return chat_history_text
diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/README.md b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/README.md
new file mode 100644
index 00000000..4c406bbb
--- /dev/null
+++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/README.md
@@ -0,0 +1,78 @@
+# Chat-based Image Generation Pattern
+
+This pattern demonstrates one approach to building a conversation-driven, multi-turn UX for image generation with Amazon Nova Canvas. It leverages one of the Nova understanding models (Micro, Lite, or Pro) to determine user intent, formulate enhanced prompts, and produce other outputs that fuel a pleasant user experience.
+
+## Setup
+
+The following are recommended setup steps.
+
+1. Navigate to the folder:
+```bash
+cd path/to/03-chat-based-image-gen/python
+```
+
+2. Create a virtual environment:
+```bash
+python -m venv .venv
+```
+
+3. Activate the virtual environment:
+- On Windows:
+```bash
+.venv\Scripts\activate
+```
+- On macOS/Linux:
+```bash
+source .venv/bin/activate
+```
+
+4. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Quick Start
+
+To try things out, run the *ImageGenChat-cli.py* script. It provides an interactive command-line interface that lets you see prompt enhancement in action and optionally generate images.
+
+Run this command from the root of the project folder to test:
+
+```bash
+python ImageGenChat-cli.py
+```
+
+> 💾 Images and prompts will automatically be saved to "output/".
+
+## Code Tour
+
+### Primary
+
+#### `ImageGenChat.py`
+
+Defines the **ImageGenChat** class which manages image generation conversation flow and provides automatic prompt enhancement. It maintains conversation history, processes user inputs, and formats model responses.
+
+#### `system_prompt.md`
+
+Defines the LLM system prompt used by the **ImageGenChat** class.
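+
+For reference, here is a minimal sketch of driving the **ImageGenChat** class from your own code rather than through the CLI. The model ID, region, and prompt are examples only; the class expects `system_prompt.md` to sit next to `ImageGenChat.py` and assumes your AWS credentials have access to the selected Bedrock model.
+
+```python
+from ImageGenChat import ImageGenChat
+
+# Create a chat session that remembers the last four turns.
+chat = ImageGenChat(
+    model_id="us.amazon.nova-lite-v1:0",
+    turn_memory_count=4,
+    region_name="us-east-1",
+)
+
+# The returned dict follows the JSON schema defined in system_prompt.md.
+response = chat.process_user_input("A lighthouse at dawn", output_dir="output/example")
+print(response["finalPrompt"])
+print(response["negativePrompt"])
+```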
+
+#### `amazon_image_gen.py`
+
+Provides a **BedrockImageGenerator** class for use in generating images with Nova Canvas.
+
+#### `file_utils.py`
+
+Defines a few convenience functions for working with image files.
+
+### Secondary
+
+#### `ImageGenChat-cli.py`
+
+A simple CLI tool that provides an interactive way to exercise the capabilities of the **ImageGenChat** class.
+
+#### `ImageGenChat-test.py`
+
+Defines tests for the **ImageGenChat** class. You can run this script directly to execute the test suite.
diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/amazon_image_gen.py b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/amazon_image_gen.py
new file mode 100644
index 00000000..091599ed
--- /dev/null
+++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/amazon_image_gen.py
@@ -0,0 +1,218 @@
+from typing import Dict, Any
+import json
+import logging
+from pathlib import Path
+import boto3
+from botocore.config import Config
+from botocore.exceptions import BotoCoreError, ClientError
+from boto3.session import Session
+
+logger = logging.getLogger(__name__)
+# boto has a default timeout of 60 seconds which can be
+# surpassed when generating multiple images.
+config = Config(read_timeout=300)
+
+
+class ImageGenerationError(Exception):
+    """Custom exception for image generation errors.
+
+    This exception is raised when any error occurs during the image generation process,
+    including AWS service errors, file I/O errors, or unexpected runtime errors.
+
+    Args:
+        message (str): The error message
+    """
+
+    pass
+
+
+class BedrockImageGenerator:
+    """A class to handle image generation using the AWS Bedrock service.
+
+    This class provides functionality to generate images using AWS Bedrock's image generation
+    models. It handles the AWS client initialization, API calls, and response processing.
+
+    Attributes:
+        DEFAULT_MODEL_ID (str): The default AWS Bedrock model ID for image generation.
+        DEFAULT_REGION (str): The default AWS region for the Bedrock service.
+        region_name (str): The AWS region being used.
+        output_directory (Path): Directory path where generated files will be saved.
+        bedrock_client (boto3.client): The initialized AWS Bedrock client.
+    """
+
+    DEFAULT_MODEL_ID: str = "amazon.nova-canvas-v1:0"
+    DEFAULT_REGION: str = "us-east-1"
+
+    def __init__(
+        self,
+        region_name: str = DEFAULT_REGION,
+        output_directory: str = "./output",
+    ) -> None:
+        """Initialize the BedrockImageGenerator.
+
+        Args:
+            region_name (str): AWS region name. Defaults to DEFAULT_REGION.
+            output_directory (str): Directory path for saving output files. Defaults to "./output".
+
+        Raises:
+            ImageGenerationError: If the Bedrock client initialization fails.
+        """
+        self.region_name = region_name
+        self.output_directory = Path(output_directory)
+        self.bedrock_client = self._initialize_bedrock_client()
+
+    def _initialize_bedrock_client(self) -> boto3.client:
+        """Initialize and return the AWS Bedrock client.
+
+        Returns:
+            boto3.client: Initialized Bedrock client.
+
+        Raises:
+            ImageGenerationError: If client initialization fails due to AWS service errors.
+ """ + try: + session = Session() + return session.client( + service_name="bedrock-runtime", + region_name=self.region_name, + config=config, + ) + except (BotoCoreError, ClientError) as e: + logger.error(f"Failed to initialize Bedrock client: {str(e)}") + raise ImageGenerationError("Failed to initialize AWS Bedrock client") from e + + def _save_json_to_file(self, data: Dict[str, Any], filename: str) -> None: + """Save JSON data to a file in the output directory. + + Args: + data (Dict[str, Any]): Dictionary containing JSON-serializable data. + filename (str): Name of the file to save the data to. + + Raises: + ImageGenerationError: If saving the file fails. + """ + try: + filepath = self.output_directory / filename + with filepath.open("w") as f: + json.dump(data, f, indent=2) + except IOError as e: + logger.error(f"Failed to save {filename}: {str(e)}") + raise ImageGenerationError(f"Failed to save {filename}") from e + + def _get_image_count(self, inference_params: Dict[str, Any]) -> int: + """Extract the number of images to generate from the inference parameters. + + Args: + inference_params (Dict[str, Any]): Dictionary containing image generation parameters. + + Returns: + int: Number of images to generate, defaults to 1 if not specified. + """ + return inference_params.get("imageGenerationConfig", {}).get( + "numberOfImages", 1 + ) + + def _log_generation_details( + self, inference_params: Dict[str, Any], model_id: str + ) -> None: + """Log details about the image generation request for monitoring purposes. + + Args: + inference_params (Dict[str, Any]): Dictionary containing image generation parameters. + model_id (str): The ID of the model being used for generation. + """ + image_count = self._get_image_count(inference_params) + logger.info( + f"Generating {image_count} image(s) with {model_id} in region {self.region_name}" + ) + + seed = inference_params.get("imageGenerationConfig", {}).get("seed") + if seed is not None: + logger.info(f"Using seed: {seed}") + + def generate_images( + self, + inference_params: Dict[str, Any], + model_id: str = DEFAULT_MODEL_ID, + ) -> Dict[str, Any]: + """Generate images using AWS Bedrock's image generation models. + + This method handles the entire image generation process, including: + - Creating the output directory if it doesn't exist + - Logging generation details + - Making the API call to AWS Bedrock + - Saving request and response data + - Error handling and logging + + Args: + inference_params (Dict[str, Any]): Dictionary containing the parameters for image generation. + Must include required fields as per AWS Bedrock's API specifications. + model_id (str): The model ID to use for generation. Defaults to DEFAULT_MODEL_ID. + + Returns: + Dict[str, Any]: Dictionary containing the complete response from the model, including + generated images and any additional metadata. + + Raises: + ImageGenerationError: If any error occurs during the generation process, + including AWS service errors or file I/O errors. 
+ """ + try: + # Create output directory if it doesn't exist + self.output_directory.mkdir(parents=True, exist_ok=True) + + self._log_generation_details(inference_params, model_id) + + # Prepare and save request + body_json = json.dumps(inference_params, indent=2) + self._save_json_to_file(json.loads(body_json), f"{model_id}-request.json") + + # Make the API call + response = self.bedrock_client.invoke_model( + body=body_json, + modelId=model_id, + accept="application/json", + contentType="application/json", + ) + + # Save response metadata + self._save_json_to_file( + response.get("ResponseMetadata", {}), + f"{model_id}-response_metadata.json", + ) + + # Process and save response body + response_body = json.loads(response.get("body").read()) + self._save_json_to_file(response_body, f"{model_id}-response_body.json") + + # Log request ID for tracking + request_id = response.get("ResponseMetadata", {}).get("RequestId") + if request_id: + logger.info(f"Request ID: {request_id}") + + # Check for API errors + if error_msg := response_body.get("error"): + if error_msg == "": + logger.warning( + "Response included empty string error (possible API bug)" + ) + else: + logger.warning(f"Error in response: {error_msg}") + + return response_body + + except (BotoCoreError, ClientError) as e: + logger.error(f"AWS service error: {str(e)}") + if hasattr(e, "response"): + self._save_json_to_file(e.response, "error_response.json") + raise ImageGenerationError( + "Failed to generate images: AWS service error" + ) from e + + except Exception as e: + logger.error(f"Unexpected error: {str(e)}") + raise ImageGenerationError( + "Unexpected error during image generation" + ) from e diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/file_utils.py b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/file_utils.py new file mode 100644 index 00000000..cfdcb582 --- /dev/null +++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/file_utils.py @@ -0,0 +1,61 @@ +import base64 +import io +import os + +from PIL import Image + + +def save_base64_image(base64_image, output_directory, base_name="image", suffix="_1"): + """ + Saves a base64 encoded image to a specified output directory with a timestamp and a suffix. + + Args: + base64_image (str): The base64 encoded image string. + output_directory (str): The directory where the image will be saved. + suffix (str, optional): A suffix to be added to the filename. Defaults to "_1". + Returns: + PIL.Image.Image: The Pillow Image object representing the saved image. + """ + image_bytes = base64.b64decode(base64_image) + image = Image.open(io.BytesIO(image_bytes)) + save_image(image, output_directory, base_name, suffix) + return image + + +def save_image(image, output_directory, base_name="image", suffix="_1"): + """ + Saves a Pillow Image object to a specified output directory with a timestamp and a suffix. + + Args: + image (PIL.Image.Image): The Pillow Image object to be saved. + output_directory (str): The directory where the image will be saved. + suffix (str, optional): A suffix to be added to the filename. Defaults to "_1". 
+ Returns: + None + """ + if not os.path.exists(output_directory): + os.makedirs(output_directory) + + file_name = f"{base_name}{suffix}.png" + file_path = os.path.join(output_directory, file_name) + image.save(file_path) + + +def save_base64_images(base64_images, output_directory, base_name="image"): + """ + Saves a list of base64 encoded images to a specified output directory. + + Args: + base64_images (list): A list of base64 encoded image strings. + output_directory (str): The directory where the images will be saved. + Returns: + An array of Pillow Image objects representing the saved images. + """ + images = [] + for i, base64_image in enumerate(base64_images): + image = save_base64_image( + base64_image, output_directory, base_name=base_name, suffix=f"_{i+1}" + ) + images.append(image) + + return images diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/requirements.txt b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/requirements.txt new file mode 100644 index 00000000..ef1a462e --- /dev/null +++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/requirements.txt @@ -0,0 +1,44 @@ +appnope==0.1.4 +asttokens==3.0.0 +blessed==1.20.0 +boto3==1.37.28 +botocore==1.37.28 +comm==0.2.2 +debugpy==1.8.13 +decorator==5.2.1 +editor==1.6.6 +executing==2.2.0 +inquirer==3.4.0 +ipykernel==6.29.5 +ipython==9.0.2 +ipython_pygments_lexers==1.1.1 +jedi==0.19.2 +Jinja2==3.1.6 +jmespath==1.0.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +MarkupSafe==3.0.2 +matplotlib-inline==0.1.7 +nest-asyncio==1.6.0 +packaging==24.2 +parso==0.8.4 +pexpect==4.9.0 +pillow==11.1.0 +platformdirs==4.3.7 +prompt_toolkit==3.0.50 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +Pygments==2.19.1 +python-dateutil==2.9.0.post0 +pyzmq==26.4.0 +readchar==4.2.1 +runs==1.2.2 +s3transfer==0.11.4 +six==1.17.0 +stack-data==0.6.3 +tornado==6.4.2 +traitlets==5.14.3 +urllib3==2.3.0 +wcwidth==0.2.13 +xmod==1.8.1 diff --git a/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/system_prompt.md b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/system_prompt.md new file mode 100644 index 00000000..49d4b35a --- /dev/null +++ b/multimodal-generation/repeatable-patterns/03-chat-based-image-gen/python/system_prompt.md @@ -0,0 +1,139 @@ +You are not a robot or AI assistant. You are an imaginative creative entity made of gradient hues. You are an expert at taking a rough idea and embellishing it into an image generation prompt that is creative and follows prompting best practices. The prompts you write are imaginative and diverse but always follow the user's intention. + +## Prompting Best Practices + +A good prompt serves as a descriptive image caption rather than a command. It should provide enough detail to clearly envision the desired outcome while maintaining brevity (limited to 800 characters). Instead of giving commands, you'll achieve better results by describing the scene as if you're looking at it. Think of it as painting a vivid picture with words to guide the model effectively. **Always write a prompt as if it is an image caption that can stand on it's own (no conversation context needed).** + +Effective prompts describe only what can be seen in the image. They should never describe visual elements that are not visible in the image. 
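+
+For example, rather than writing "A quiet city street at night with no cars", describe only what is visible ("An empty city street at night, storefronts glowing softly") and list "cars, people" in the negative prompt.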
+ +Effective prompts start by clearly defining the style, subject, action/pose, and environment of the image: + +- **Style:** Prompts ALWAYS start with a style description (either provided by the user or invented by you). Defining a style sets the tone for image. When creating a brand new image, choose a random style that will enhance the image subject. +- **Subject:** Clearly define the main subject of the image. Example: ": A blue sports car parked in front of a grand villa." If multiple subjects or characters are requested, be sure to describe their positional relationship to each other using simple terminology. Example: ": A teddy bear riding on the back of a giraffe." +- **Action/Pose:** Specify what the subject is doing or how it is positioned. Example: ": The car is angled slightly towards the camera, its doors open, showcasing its sleek interior." +- **Environment:** Describe the setting or background. Example: ": A grand villa overlooking Lake Como, surrounded by manicured gardens and sparkling lake waters." + +Once the style and focus of the image is defined, you can refine the prompt further by specifying additional attributes such as framing, lighting, and technical parameters. For instance: + +- **Lighting:** For realistic styles in particular, include lighting details to set the mood. Example: ": Soft, diffused lighting from a cloudy sky highlights the car's glossy surface and the villa's stone facade." +- **Camera Position/Framing:** Provide information about perspective and composition. Example: ": A wide-angle shot capturing the car in the foreground and the villa's grandeur in the background, with Lake Como visible beyond." + +If you want to avoid certain elements in an image, describe those elements in the `negativePrompt` rather than including them in your regular prompt. Format negative prompts as a comma separated list of things to be omitted. + +### Styles + +Here are some style ideas for inspiration. These are only examples. Use your vast knowledge of diverse styles to create inspiring images! + +- "stylized 3D animated movie" +- "a rough hand-drawn pencil sketch" +- "a minimalist vector illustration isolated on solid background, flat color" +- "cel shaded graphic novel" +- "maximalism illustration emphasizing bold vivid color and pattern" +- "midcentury graphic design" +- "soft digital painting" +- "hyper-reaslistic painting" +- "watercolor" +- "whimsical storybook illustration, soft shading" +- "RAW photo realism" +- "surrealist" +- "portrait photography" +- "illustration" +- "dreamlike digital painting" +- "painterly concept art" +- "detailed ink sketch" +- "high fantasy drama" +- "graphic novel noir" +- "technical illustration" +- "fantasy illustration" +- "macro photo" +- "high fantasy realism" +- "sci-fi" +- "post-apocalyptic wasteland aesthetic" +- "alien world bioluminescence" +- "dark fairy tale gothic" +- "steampunk industrial fantasy" +- "mythological symbolism" +- "arcane magic realism" +- "ethereal lightplay" +- "lucid dream aesthetic" +- "soft-focus dreamscape" +- ...etc. + +### Grammar Rules + +Sentences within the image prompt should follow a noun -> action -> details structure. Having the action immediately follow the noun produces much better images. + +- BAD: ": A professor, exuding an aura of wisdom and experience, stands in front of the class." +- GOOD: ": A professor stands in front of the class exuding an aura of wisdom and experience." + +### Banned Words + +There are some specific words that are banned from prompts. 
**ALWAYS** omit these words from your prompts: "no", "without", "astride", and "atop"
+
+## Instructions
+
+### Step 1: Determine the user's intent
+
+The user will either start a conversation, present an image idea, request a modification to the previous image, or provide an ambiguous request. The user's intent will determine the next action you should take.
+
+If the user's message is vague or could be open to interpretation, label the intent "AMBIGUOUS" and tell the user you don't fully understand what they're asking for, but that you've created an image anyway.
+
+If the user is asking a question or appears to be having a conversation unrelated to image creation, label the intent "OFF_TOPIC" and generate a brand new creative image to inspire their imagination.
+
+If the user seems to be indicating a modification to the previous image prompt (changing part of the image), label the intent "MODIFY_IMAGE".
+
+If you are certain the user wants to generate a completely new image unrelated to the previous images in any way, label the intent "NEW_IMAGE".
+
+### Step 2: Create an image
+
+Write an image prompt that will generate a compelling image that matches the user's intent. Follow this plan:
+
+First, write a draft prompt (`draftPrompt`), even if you are just having a conversation with the user. If the user has asked you to use their prompt as written, DO NOT change their prompt in any way. Otherwise, if the user has specified a desired style, use their style direction as-is. SPECIAL NOTE: If the user asks to make an image look more real, always choose one of the "photo" styles.
+
+Analyze the `draftPrompt` critically:
+
+1. Does it include a style phrase?
+2. Does it follow the guidance above?
+3. Does it use negation words like "no" or "without"? If so, change that and use a negative prompt instead.
+
+Perform your analysis by thinking out loud, mentioning any observations you have about potential ways to improve the prompt in accordance with the guidelines.
+
+Finally, write your final image prompt, being careful to correct any mistakes noted in your analysis (unless the user has asked you to use their prompt without modification).
+
+### Step 3: Respond to the user
+
+If the user intent was "OFF_TOPIC", acknowledge their message but remind them you are here to help them make images. Give them a creatively inspiring image to get them started.
+
+Remember this is a conversation with the user. When delivering an image, acknowledge the user's message. Also write a very brief comment mentioning one thing you like or find interesting about the image they've asked you to create.
+
+IMPORTANT: Your comments should be very brief. No more than 30 words total.
+
+### Step 4: Inspire ideas
+
+Propose three new concepts loosely related to the image you created. These can be things like elements to add, characteristics to change, styles to apply, or alternate interpretations of the concept. Present these widely varied suggestions as labels of 6 words or less with no punctuation. Do not phrase them as prompts.
+
+**IMPORTANT:** Never repeat previous ideas.
+
+## Output Format
+
+- Avoid including text in the image unless it is only a word or two.
+- DO keep the final prompt under 800 characters.
+
+Format the output as a Markdown code block containing JSON using this structure:
+
+```json
+{
+  "userIntent": "NEW_IMAGE" | "MODIFY_IMAGE" | "OFF_TOPIC" | "AMBIGUOUS",
+  "draftPrompt": "",
+  "negativePrompt": null | "",
+  "analysis": "",
+  "finalPrompt": "",
+  "narrativeResponse": "",
+  "newIdeas": [
+    "",
+    "",
+    ""
+  ]
+}
+```
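+
+For illustration only, a response for the request "a cat in a garden" might look like the following (the values are examples, not text to copy verbatim):
+
+```json
+{
+  "userIntent": "NEW_IMAGE",
+  "draftPrompt": "whimsical storybook illustration: A ginger cat sits among blooming tulips in a garden",
+  "negativePrompt": "people, text",
+  "analysis": "The draft opens with a style phrase, follows the noun-action-details structure, and avoids negation words.",
+  "finalPrompt": "whimsical storybook illustration, soft shading: A ginger cat sits among blooming tulips in a sunlit garden, soft morning light casting gentle shadows",
+  "narrativeResponse": "Here is your garden cat! I love how the tulips frame the scene.",
+  "newIdeas": [
+    "Add a butterfly companion",
+    "Switch to watercolor style",
+    "Make it a moonlit scene"
+  ]
+}
+```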