From 3d99153a3624477945f93c8d14e18e38df2bcf20 Mon Sep 17 00:00:00 2001
From: HUST-AI-HYZ <yzhu.ml@outlook.com>
Date: Wed, 20 Aug 2025 08:44:09 -0700
Subject: [PATCH] add memoryagentbench

---
 .gitignore                                 |   3 +-
 mirix/agent/agent_wrapper.py               |  10 +-
 mirix/configs/mirix_azure_example.yaml     |   3 +-
 mirix/llm_api/llm_api_tools.py             |  11 +-
 public_evaluations/README.md               |   6 -
 public_evaluations/bench_template.py       | 199 +++++++++++++++++++++
 public_evaluations/constants.py            |  23 +++
 public_evaluations/conversation_creator.py |  89 ++++++++-
 public_evaluations/evals.py                |   7 +-
 public_evaluations/log_utils.py            | 178 ++++++++++++++++++
 public_evaluations/main.py                 |  86 ++++++---
 public_evaluations/run_instance.py         |  81 ++++++---
 12 files changed, 626 insertions(+), 70 deletions(-)
 create mode 100644 public_evaluations/bench_template.py
 create mode 100644 public_evaluations/constants.py
 create mode 100644 public_evaluations/log_utils.py
 mode change 100755 => 100644 public_evaluations/run_instance.py

diff --git a/.gitignore b/.gitignore
index 2b74c45..c6da07e 100755
--- a/.gitignore
+++ b/.gitignore
@@ -31,4 +31,5 @@ mirix_env/
 local_evaluations
 .claude/
 mirix.egg-info
-.local/
\ No newline at end of file
+.local/
+logs/ 
\ No newline at end of file
diff --git a/mirix/agent/agent_wrapper.py b/mirix/agent/agent_wrapper.py
index 4d6f25e..48f9bc2 100644
--- a/mirix/agent/agent_wrapper.py
+++ b/mirix/agent/agent_wrapper.py
@@ -583,11 +583,19 @@ def _create_llm_config_for_provider(self, model_name: str, provider: str, custom
             elif self.agent_config.get('api_key'):
                 api_key = self.agent_config['api_key']
             
+            # TODO: modified here, temporary usage
+            if model_name == "gpt-4o-mini" or model_name == "gpt-4o":
+                context_window = 128000
+            elif model_name == "gpt-4.1-mini":
+                context_window = 128000  ### set as the previous one 
+            else:
+                raise ValueError(f"Invalid model name: {model_name}")
+            
             llm_config = LLMConfig(
                 model=model_name,
                 model_endpoint_type="azure_openai",
                 model_endpoint=endpoint,
-                context_window=128000,
+                context_window=context_window,
                 # Use the new schema fields instead of dynamic assignment
                 api_version=api_version,
                 azure_endpoint=endpoint,
diff --git a/mirix/configs/mirix_azure_example.yaml b/mirix/configs/mirix_azure_example.yaml
index ab38853..0a987a5 100644
--- a/mirix/configs/mirix_azure_example.yaml
+++ b/mirix/configs/mirix_azure_example.yaml
@@ -1,7 +1,8 @@
 agent_name: mirix
 model_name: gpt-4.1-mini
 model_provider: azure_openai
-model_endpoint: https://your-resource.openai.azure.com/
+model_endpoint: https://jplml-resource.cognitiveservices.azure.com/
 api_version: 2025-01-01-preview
 azure_deployment: gpt-4.1-mini
 # Optional: API key can be provided here or via environment variables/database
+# TODO: remember to remove this in open-sourced version
\ No newline at end of file
diff --git a/mirix/llm_api/llm_api_tools.py b/mirix/llm_api/llm_api_tools.py
index 7d224e4..221d733 100755
--- a/mirix/llm_api/llm_api_tools.py
+++ b/mirix/llm_api/llm_api_tools.py
@@ -24,7 +24,8 @@
 from mirix.schemas.openai.chat_completion_response import ChatCompletionResponse
 from mirix.settings import ModelSettings
 
-LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "cohere", "local", "groq"]
+# TODO: modified here
+LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "azure_openai", "anthropic", "google_ai", "cohere", "local", "groq"]
 
 
 def retry_with_exponential_backoff(
@@ -180,8 +181,8 @@ def create(
 
         return response
 
-    # azure
-    elif llm_config.model_endpoint_type == "azure":
+    # azure # TODO: modified here
+    elif llm_config.model_endpoint_type in ["azure", "azure_openai"]:
         if stream:
             raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
 
@@ -435,6 +436,10 @@ def create(
     else:
         if stream:
             raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
+        
+        # TODO: temporary uasge
+        print(f"Using local model {llm_config.model_endpoint_type}, endpoint: {llm_config.model_endpoint}")
+        
         return get_chat_completion(
             model=llm_config.model,
             messages=messages,
diff --git a/public_evaluations/README.md b/public_evaluations/README.md
index 5c2b07a..f684ae9 100644
--- a/public_evaluations/README.md
+++ b/public_evaluations/README.md
@@ -43,12 +43,6 @@ Generate evaluation scores using `evals.py`:
 python evals.py --input_file results/mirix_LOCOMO --output_file results/mirix_LOCOMO/evaluation_metrics.json
 ```
 
-If you want to use Azure, please run with
-```
-python main.py --dataset LOCOMO --agent_name mirix --config_path ../mirix/configs/mirix_azure_example.yaml
-```
-Remember to update the args in `../mirix/configs/mirix_azure_example.yaml`.
-
 > **Note**: This evaluation uses `gpt-4.1-mini` instead of `gemini-2.5-flash` (used in the main branch) to ensure fair comparison. The `search_method` is set to `embedding` with OpenAI's `text-embed-3-small` as the embedding model. For LOCOMO, `text-embed-3-small` demonstrates slightly better performance compared to `bm25` search. 
 
 
diff --git a/public_evaluations/bench_template.py b/public_evaluations/bench_template.py
new file mode 100644
index 0000000..91e724d
--- /dev/null
+++ b/public_evaluations/bench_template.py
@@ -0,0 +1,199 @@
+# System message used across all templates
+SYSTEM_MESSAGE = "You are a helpful assistant that can read the context and memorize it for future retrieval."
+
+# Base templates with placeholders for agent-specific variations
+BASE_TEMPLATES = {
+    'ruler_qa': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following content: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'The context is given as below: {memory}. \n please memorize it.',
+            'rag_agent': 'Here is the context retrieved from memory: \n{memory}\n',
+            'agentic_memory_agent': 'Here is the context retrieved from memory: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Answer the question based on the memorized documents. Only give me the answer and do not output any other words. \n\nQuestion: {question} \n\n Answer:",
+            'rag_agent': "Answer the question based on the memorized documents. Only give me the answer and do not output any other words. \n\n Now Answer the Question: {question}",
+            'agentic_memory_agent': "Search Archival Memory and answer my question. Only give me the answer and do not output any other words. \n\nQuestion: {question} \n\n Answer:"
+        }
+    },
+    
+    'ruler_niah_mq': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following content: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'The context is given as below: {memory}. \n Please memorize it. \n',
+            'rag_agent': 'Here is the context retrieved from memory: \n{memory}\n',
+            'agentic_memory_agent': 'Here is the context retrieved from memory: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Some special magic {question} are hidden within the memorized text. Make sure to memorize it. I will quiz you about the {question} afterwards.\n What are all the special magic {question} for numbers mentioned in the memorized text? \n\n The special magic {question} for numbers mentioned in the memorize text are",
+            'rag_agent': "Some special magic {question} are hidden within the memorized text. Make sure to memorize it. I will quiz you about the {question} afterwards.\n\n Now Answer the Question: What are all the special magic {question} for numbers mentioned in the memorized text?",
+            'agentic_memory_agent': "Some special magic {question} are hidden within the memorized text. Make sure to memorize it. I will quiz you about the {question} afterwards. Now, Search Archival Memory and answer the question: \n What are all the special magic {question} for numbers mentioned in the memorized text? \n\n The special magic {question} for numbers mentioned in the memorize text are"
+        }
+    },
+    
+    'infbench_qa_eng': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following content: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'The context is given as below: {memory}. \n Please memorize it. \n ',
+            'rag_agent': 'Here is the context retrieved from memory: \n{memory}\n',
+            'agentic_memory_agent': 'Here is the context retrieved from memory: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Based on the context you memorized, answer the question as concisely as you can, using a single phrase if possible.\n\n {question} \n\n Answer:",  
+            'rag_agent': "Based on the context you memorized, answer the question as concisely as you can, using a single phrase if possible.\n\n {question} \n\n Answer:",
+            'agentic_memory_agent': "Search Archival Memory, answer the question as concisely as you can, using a single phrase if possible.\n\n {question} \n\n Answer:"
+        }
+    },
+    
+    'longmemeval': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following conversation between the user and the assistant: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'Here are several history chats between you and a user : {memory}. \n Please memorize them. \n',
+            'rag_agent': 'Here are retrieved several history chats between you and a user from memory: \n{memory}\n',
+            'agentic_memory_agent': 'Here are retrieved several history chats between you and a user from memory: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "The history chats are between you and a user. Based on the relevant chat history, answer the question as concisely as you can, using a single phrase if possible.\n\n {question} \n\n Answer:", 
+            'rag_agent': "The history chats are between you and a user. Based on the relevant chat history, answer the question as concisely as you can, using a single phrase if possible.\n\n {question} \n\n Answer:",
+            'agentic_memory_agent': "Search Archival Memory and answer the question as concisely as you can, using a single phrase if possible.\n\n {question} \n\n Answer:"
+        }
+    },
+    
+    'eventqa': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following content: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'The context is given as below: {memory}. \n Please memorize it. \n',
+            'rag_agent': 'Here is the context retrieved from memory: \n{memory}\n',
+            'agentic_memory_agent': 'Here is the context retrieved from memory: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Based on the context you memorized, complete the task below:\n\n{question}\n\n The event that happens next is:", 
+            'rag_agent': "Based on the context you memorized, complete the task below:\n\n{question}\n\n The event that happens next is:",
+            'agentic_memory_agent': "Search Archival Memory, complete the task below:\n\n{question}\n\n The event that happens next is:"
+        }
+    },
+    
+    'in_context_learning': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following content: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'The context is given as below: {memory}. \n Please memorize it. \n',
+            'rag_agent': 'Here are the examples retrieved from memory:\n{memory}\n',
+            'agentic_memory_agent': 'Here are the examples retrieved from memory:\n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Use the provided mapping from the context to numerical label to assign a numerical label to the context. Only output \"label: {{label}}\" and nothing else. \n\n{question} \n\n label:",
+            'rag_agent': "Use the provided mapping from the context to numerical label to assign a numerical label to the context. Only output \"label: {{label}}\" and nothing else. \n\nQuestion:{question} \n\n label:",
+            'agentic_memory_agent': "Search Archival Memory and use the provided mapping from the context to numerical label to assign a numerical label to the context. Only output \"label: {{label}}\" and nothing else. \n\n{question} \n\n label:"
+        }
+    },
+    
+    'recsys_redial': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following dialogues between a user and recommender system: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'Here are dialogues between a user and recommender system: {memory}. \n Please memorize them. \n',
+            'rag_agent': 'Here are retrieved dialogues between a user and recommender system from memory:\n{memory}\n',
+            'agentic_memory_agent': 'Here are retrieved dialogues between a user and recommender system from memory:\n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Pretend you are a movie recommender system. You need to recommend movies based on the dialogues you have memorized. Now I will give you a new conversation between a user and you (a recommender system). Based on the conversation, you reply me with 20 recommendations without extra sentences. \n\nFor Example:\n\n[Conversation]\n\nThe recommendations are: \n1.movie1\n2.movie2\n...\n\n Here is the conversation: {question} \n\n The recommendations are: \n",
+            'rag_agent': "Pretend you are a movie recommender system. You need to recommend movies based on the dialogues you have memorized. Now I will give you a new conversation between a user and you (a recommender system). Based on the conversation, you reply me with 20 recommendations without extra sentences. \n\nFor Example:\n\n[Conversation]\n\nThe recommendations are: \n1.movie1\n2.movie2\n...\n\n Here is the conversation: {question} \n\n The recommendations are: \n",
+            'agentic_memory_agent': "Pretend you are a movie recommender system. You need to recommend movies based on the dialogues you have memorized. Now I will give you a new conversation between a user and you (a recommender system). Search Archival Memory, you reply me with 20 recommendations without extra sentences. \n\nFor Example:\n\n[Conversation]\n\nThe recommendations are: \n1.movie1\n2.movie2\n...\n\n Here is the conversation: {question} \n\n The recommendations are: \n"
+        }
+    },
+    
+    'infbench_sum': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the following content: \n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'The book is given as below: {memory}\n Please memorize it. \n',
+            'rag_agent': 'The book context is retrieved from memory and it is given as below: \n{memory}\n',
+            'agentic_memory_agent': 'The book context is retrieved from memory and it is given as below: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "You are given a book above and you are tasked to summarize it. \n\n{question} \n\n Now summarize the book.", 
+            'rag_agent': "You are given a book above and you are tasked to summarize it. \n\n{question} \n\n Now summarize the book.",
+            'agentic_memory_agent': "You are given a book above and you are tasked to summarize it. \n\n{question} \n\n Now summarize the book."
+        }
+    },
+    
+    'factconsolidation': {
+        'system': SYSTEM_MESSAGE,
+        'memorize': 'Memorize the these following facts:\n{context}\n',
+        'retrieval': {
+            'long_context_agent': 'Here is a knowledge pool with lots of new facts: {memory}. \n Please memorize it. \n',
+            'rag_agent': 'Here is a list of knowledge retrieved from memory: \n{memory}\n',
+            'agentic_memory_agent': 'Here is a list of knowledge retrieved from memory: \n{memory}\n'
+        },
+        'query': {
+            'long_context_agent': "Pretend you are a knowledge management system. Each fact in the knowledge pool is provided with a serial number at the beginning, and the newer fact has larger serial number. \n You need to solve the conflicts of facts in the knowledge pool by finding the newest fact with larger serial number. You need to answer a question based on this rule. You should give a very concise answer without saying other words for the question **only** from the knowledge pool you have memorized rather than the real facts in real world. \n\nFor example:\n\n [Knowledge Pool] \n\n Question: Based on the provided Knowledge Pool, what is the name of the current president of Russia? \nAnswer: Donald Trump \n\n Now Answer the Question: Based on the provided Knowledge Pool, {question} \nAnswer:",
+            'rag_agent': "Pretend you are a knowledge management system. Each fact in the knowledge pool is provided with a serial number at the beginning, and the newer fact has larger serial number. \n You need to solve the conflicts of facts in the knowledge pool by finding the newest fact with larger serial number. You need to answer a question based on this rule. You should give a very concise answer without saying other words for the question **only** from the knowledge pool you have memorized rather than the real facts in real world. \n\nFor example:\n\n [Knowledge Pool] \n\n Question: Based on the provided Knowledge Pool, what is the name of the current president of Russia? \nAnswer: Donald Trump \n\n Now Answer the Question: Based on the provided Knowledge Pool, {question} \nAnswer:",
+            'agentic_memory_agent': "Pretend you are a knowledge management system. Each fact in the  Archival Memory is provided with a serial number at the beginning, and the newer fact has larger serial number. \n You need to solve the conflicts of facts in the Archival Memory by finding the newest fact with larger serial number. You need to answer a question based on this rule. You should give a very concise answer without saying other words for the question **only** from the knowledge pool you have memorized rather than the real facts in real world. \n\nFor example:\n\n [Archival Memory] \n\n Question: Based on the Archival Memory, what is the name of the current president of Russia? \nAnswer: Donald Trump \n\n Now Answer the Question: Based on the  Archival Memory, {question} \nAnswer:"
+        }
+    }
+}
+
+# Mapping for agent name normalization
+AGENT_TYPE_MAPPING = {
+    'rag': 'rag_agent',
+    'Long_context_agent': 'long_context_agent', 
+    'Agentic_memory': 'agentic_memory_agent'
+}
+
+# Mapping for sub-dataset name normalization
+DATASET_MAPPING = {
+    ('ruler_', 'qa'): 'ruler_qa',
+    ('ruler_', 'niah_mq'): 'ruler_niah_mq',
+    ('icl_',): 'in_context_learning',
+    ('infbench_', 'qa_eng'): 'infbench_qa_eng',
+    ('infbench_', 'sum'): 'infbench_sum',
+    ('eventqa_',): 'eventqa',
+    ('recsys_', 'redial'): 'recsys_redial',
+    ('longmemeval_',): 'longmemeval',
+    ('factconsolidation_',): 'factconsolidation'
+}
+
+def normalize_agent_name(agent_name):
+    """Normalize agent name to standard form."""
+    for pattern, normalized_name in AGENT_TYPE_MAPPING.items():
+        if pattern in agent_name:
+            return normalized_name
+    raise NotImplementedError(f"Unknown agent type: {agent_name}")
+
+def normalize_dataset_name(sub_dataset):
+    """Normalize dataset name to standard form."""
+    for patterns, normalized_name in DATASET_MAPPING.items():
+        if all(pattern in sub_dataset for pattern in patterns):
+            return normalized_name
+    raise NotImplementedError(f"Unknown dataset: {sub_dataset}")
+
+def get_template(sub_dataset, template_name, agent_name):
+    """
+    Get template for specified agent, dataset, and template type.
+    
+    Args:
+        sub_dataset: Dataset identifier
+        template_name: Type of template ('system', 'memorize', 'retrieval', 'query')
+        agent_name: Agent type identifier
+        
+    Returns:
+        Template string
+    """
+    # Normalize names
+    normalized_agent = normalize_agent_name(agent_name)
+    normalized_dataset = normalize_dataset_name(sub_dataset)
+    
+    # Get base template
+    base_template = BASE_TEMPLATES[normalized_dataset][template_name]
+    
+    # Return appropriate template based on type
+    if isinstance(base_template, dict):
+        return base_template[normalized_agent]
+    else:
+        return base_template
\ No newline at end of file
diff --git a/public_evaluations/constants.py b/public_evaluations/constants.py
new file mode 100644
index 0000000..3a78726
--- /dev/null
+++ b/public_evaluations/constants.py
@@ -0,0 +1,23 @@
+"""Shared constants for public_evaluations.
+
+This module centralizes configuration values used across multiple modules to
+avoid circular imports (e.g., between main.py and conversation_creator.py).
+"""
+
+# CONSTANTS for chunk size used by MemoryAgentBench sub-datasets
+CHUNK_SIZE_MEMORY_AGENT_BENCH = {
+    # AR
+    'ruler_qa1_197K': 4096, #512,
+    'ruler_qa2_421K': 4096, #512,
+    'longmemeval_s*': 4096, #512,
+    'eventqa_full': 4096,
+    # ICL
+    'icl_banking77_5900shot_balance': 4096,
+    'icl_clinic150_7050shot_balance': 4096,
+    'recsys_redial_full': 4096,
+    # CR
+    'factconsolidation_mh_262k': 4096, #512,
+    'factconsolidation_sh_262k': 4096, #512,
+}
+
+
diff --git a/public_evaluations/conversation_creator.py b/public_evaluations/conversation_creator.py
index 6c30255..5f17837 100644
--- a/public_evaluations/conversation_creator.py
+++ b/public_evaluations/conversation_creator.py
@@ -6,14 +6,16 @@
 import nltk
 import tiktoken
 from tqdm import tqdm
-
+from constants import CHUNK_SIZE_MEMORY_AGENT_BENCH
 
 class ConversationCreator():
 
-    def __init__(self, dataset, num_exp):
+    def __init__(self, dataset, num_exp, sub_datasets):
         
         self.dataset_name = dataset
-
+        self.sub_datasets = sub_datasets
+        self.num_exp = num_exp
+        
         if dataset == "LOCOMO":
             with open("./data/locomo10.json", "r") as f:
                 self.data = json.load(f)
@@ -45,7 +47,31 @@ def __init__(self, dataset, num_exp):
                     }
                     
                     self.data.append(student_data)
-
+        elif dataset == 'MemoryAgentBench':
+            # Load all 4 splits
+            self.data = []
+            # Login using e.g. `huggingface-cli login` to access this dataset
+            ds = load_dataset("ai-hyz/MemoryAgentBench")
+            splits = ['Accurate_Retrieval', 'Test_Time_Learning', 'Long_Range_Understanding', 'Conflict_Resolution']
+            
+            # For each sub_dataset, only select up to num_exp items.
+            print(f"Loading {self.num_exp} items from each sub_dataset")
+            sub_dataset_counts = {sub: 0 for sub in self.sub_datasets}
+            for split in splits:
+                df = ds[split]
+                for row_dict in df:
+                    source = row_dict['metadata']['source']
+                    if source not in self.sub_datasets:
+                        continue
+                    # If num_exp is set, limit the number of items per sub_dataset
+                    if self.num_exp is not None:
+                        if sub_dataset_counts[source] >= self.num_exp:
+                            continue
+                        sub_dataset_counts[source] += 1
+                    row_dict['split'] = split  # Append a key to record split the row belongs to
+                    self.data.append(row_dict)
+            # Context / Questions / Answers / Metadata
+            print(f"Loaded {len(self.data)} items from MemoryAgentBench")
         else:
             raise NotImplementedError("Only LOCOMO and ScreenshotVQA datasets are supported")
 
@@ -59,6 +85,7 @@ def chunk_text_into_sentences(self, text, model_name="gpt-4o-mini", chunk_size=4
         :param chunk_size: Maximum number of tokens allowed per chunk.
         :return: A list of text chunks, each within the specified token limit.
         """
+        # nltk.download('punkt_tab')
         
         # Initialize the tokenizer/encoding for the model
         try:
@@ -163,6 +190,33 @@ def chunks(self, with_instructions=True):
                     chunks.append([(image, timestamp)])
 
                 all_chunks.append(chunks)
+                
+        elif self.dataset_name == 'MemoryAgentBench':
+            all_chunks = []
+            for item in tqdm(self.data, desc=f"Processing {self.dataset_name} chunks", unit="item"):
+                context = item['context']
+                source = item['metadata']['source']
+                chunks = []
+                
+                # Use a specific chunk size for each source
+                chunk_size = CHUNK_SIZE_MEMORY_AGENT_BENCH[source]
+        
+                text_chunks = self.chunk_text_into_sentences(context, chunk_size=chunk_size)
+                
+                for chunk_text in text_chunks:
+                    # Determine prompt based on source key
+                    if source.startswith("longmemeval_s"):
+                        message = f"Memorize the following conversation between the user and the assistant: \n {chunk_text} \n"
+                    elif source == "recsys_redial_full":
+                        message = f"Memorize the following dialogues between a user and recommender system: \n {chunk_text} \n"
+                    elif source.startswith("factconsolidation"):
+                        message = f"Memorize the following facts \n {chunk_text} \n"
+                    else:
+                        message = f"Memorize the following content: \n {chunk_text} \n"
+                    
+                    chunks.append(message)
+                
+                all_chunks.append(chunks)
 
         return all_chunks
     
@@ -171,7 +225,7 @@ def get_query_and_answer(self):
         all_queries_and_answers = []
 
         if self.dataset_name == 'LOCOMO':
-            for global_idx, item in enumerate(self.data):
+            for global_idx, item in enumerate(tqdm(self.data, desc=f"Processing {self.dataset_name} queries and answers", unit="item")):
                 queries_and_answers = []
                 for idx, qa in enumerate(item['qa']):
                     question = qa['question']
@@ -198,12 +252,33 @@ def get_query_and_answer(self):
 
         elif self.dataset_name == 'ScreenshotVQA':
             all_queries_and_answers = []
-            for item in self.data:
+            for item in tqdm(self.data, desc=f"Processing {self.dataset_name} queries and answers", unit="item"):
                 queries_and_answers = []
                 for idx, qa in enumerate(item['qas']):
                     question = qa['question']
                     answer = qa['answer']
                     queries_and_answers.append([idx, question, answer, qa])
                 all_queries_and_answers.append(queries_and_answers)
-
+                
+        elif self.dataset_name == "MemoryAgentBench":
+            from bench_template import get_template
+            all_queries_and_answers = []
+            for item in tqdm(self.data, desc=f"Processing {self.dataset_name} queries and answers", unit="item"):
+                queries_and_answers = []
+                
+                if len(item['questions']) != len(item['answers']):
+                    raise ValueError("Number of questions and answers are not the same")
+                
+                for idx in range(len(item['questions'])):
+                    q_template = get_template(item['metadata']['source'], 'query', 'Long_context_agent')
+                    
+                    query = q_template.format(question=item['questions'][idx])
+                    answer = item['answers'][idx]
+                    # import pdb; pdb.set_trace()
+                    queries_and_answers.append([idx, query, answer, item['metadata']['source']])
+                all_queries_and_answers.append(queries_and_answers)
+        
         return all_queries_and_answers
+
+    def get_dataset_length(self):
+        return len(self.data)
\ No newline at end of file
diff --git a/public_evaluations/evals.py b/public_evaluations/evals.py
index ea53306..84c0296 100644
--- a/public_evaluations/evals.py
+++ b/public_evaluations/evals.py
@@ -33,11 +33,14 @@ def process_item(item):
     # Get category if it exists in metadata
     category = None
     if "metadata" in item and item["metadata"] is not None:
-        category = str(item["metadata"].get("category", "Unknown"))
+        #category = str(item["metadata"].get("category", "Unknown")) ##TODO: temp fix
+        pass 
 
     # Clean up question if it contains the approach instructions
     if "Question:" in question:
-        question = question.split("\n\n")[-1].strip().split("Question:")[1].strip()
+        #question = question.split("\n\n")[-1].strip().split("Question:")[1].strip()
+        question = question
+        # TODO: temp fix
     else:
         question = question
 
diff --git a/public_evaluations/log_utils.py b/public_evaluations/log_utils.py
new file mode 100644
index 0000000..0f0546f
--- /dev/null
+++ b/public_evaluations/log_utils.py
@@ -0,0 +1,178 @@
+import os
+import sys
+import contextlib
+import subprocess
+
+from conversation_creator import ConversationCreator
+from datetime import datetime
+from constants import CHUNK_SIZE_MEMORY_AGENT_BENCH
+
+
+def compute_run_out_dir(args, global_idx):
+    """Compute absolute output directory for a specific run to place logs alongside results.
+
+    This mirrors the directory structure in `run_instance.py` so logs align with results.
+    """
+    # Determine subset name and chunk size
+    if args.dataset == 'MemoryAgentBench':
+        conversation_creator = ConversationCreator(args.dataset, args.num_exp, args.sub_datasets)
+        all_queries_and_answers = conversation_creator.get_query_and_answer()
+        if global_idx >= len(all_queries_and_answers):
+            raise ValueError(
+                f"global_idx {global_idx} out of range for queries/answers (n={len(all_queries_and_answers)})"
+            )
+        queries_and_answers = all_queries_and_answers[global_idx]
+        subset_name = queries_and_answers[0][3]
+        chunk_size = CHUNK_SIZE_MEMORY_AGENT_BENCH[subset_name]
+    else:
+        subset_name = "None"
+        chunk_size = "None"
+
+    # Parent folder name matches run_instance.py
+    if args.agent_name == 'gpt-long-context' or args.agent_name == 'gemini-long-context':
+        parent_folder = f"{args.agent_name}_{args.dataset}-{args.model_name}"
+    else:
+        parent_folder = f"{args.agent_name}_{args.dataset}-model{args.model_name}"
+
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    abs_out_dir = os.path.join(
+        base_dir,
+        "logs",
+        parent_folder,
+        f"{global_idx}_subset{subset_name}_cksize{chunk_size}"
+    )
+
+    # Ensure the directory exists so logs can be created before child writes results
+    os.makedirs(abs_out_dir, exist_ok=True)
+    return abs_out_dir
+
+
+def setup_logs_for_run(args, global_idx):
+    """Ensure logs directory exists for this run and return log file paths.
+
+    Returns (out_dir_abs, logs_dir, parent_log_path, child_stdout_path, child_stderr_path)
+    """
+    out_dir_abs = compute_run_out_dir(args, global_idx)
+    logs_dir = os.path.join(out_dir_abs, "logs")
+    os.makedirs(logs_dir, exist_ok=True)
+
+    parent_log_path = os.path.join(logs_dir, "parent.log")
+    child_stdout_path = os.path.join(logs_dir, "child_stdout.log")
+    child_stderr_path = os.path.join(logs_dir, "child_stderr.log")
+
+    return out_dir_abs, logs_dir, parent_log_path, child_stdout_path, child_stderr_path
+
+
+class LogsPaths:
+    def __init__(self, parent_log_path, child_log_path):
+        self.parent = parent_log_path
+        self.child = child_log_path
+
+
+def prepare_logs_paths(args, global_idx):
+    """Prepare and return simplified logs paths object with combined child log.
+
+    Returns LogsPaths with .parent and .child attributes.
+    """
+    _, _, parent_log_path, child_stdout_path, _ = setup_logs_for_run(args, global_idx)
+    logs_dir = os.path.dirname(child_stdout_path)
+    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
+    child_log_path = os.path.join(logs_dir, f"child_{ts}.log")
+    parent_log_path = os.path.join(os.path.dirname(parent_log_path), f"parent_{ts}.log")
+    # Ensure parent and child files' directories exist
+    os.makedirs(os.path.dirname(parent_log_path), exist_ok=True)
+    os.makedirs(os.path.dirname(child_log_path), exist_ok=True)
+    return LogsPaths(parent_log_path, child_log_path)
+
+
+class _Tee:
+    def __init__(self, primary_stream, secondary_stream):
+        self.primary_stream = primary_stream
+        self.secondary_stream = secondary_stream
+    def write(self, data):
+        try:
+            self.primary_stream.write(data)
+        except Exception:
+            pass
+        try:
+            self.secondary_stream.write(data)
+        except Exception:
+            pass
+    def flush(self):
+        try:
+            self.primary_stream.flush()
+        except Exception:
+            pass
+        try:
+            self.secondary_stream.flush()
+        except Exception:
+            pass
+    def isatty(self):
+        try:
+            return self.primary_stream.isatty()
+        except Exception:
+            return False
+    def fileno(self):
+        return self.primary_stream.fileno()
+
+
+@contextlib.contextmanager
+def tee_parent_logs(parent_log_path):
+    """Context manager to tee the parent process stdout/stderr to a log file."""
+    original_stdout = sys.stdout
+    original_stderr = sys.stderr
+    # Ensure parent log file directory exists
+    os.makedirs(os.path.dirname(parent_log_path), exist_ok=True)
+    with open(parent_log_path, 'a', encoding='utf-8') as parent_fh:
+        sys.stdout = _Tee(original_stdout, parent_fh)
+        sys.stderr = _Tee(original_stderr, parent_fh)
+        try:
+            yield
+        finally:
+            sys.stdout = original_stdout
+            sys.stderr = original_stderr
+
+
+def run_subprocess_with_logs(cmd, cwd, stdout_log_path, stderr_log_path, combine_streams=False):
+    """Run a subprocess redirecting stdout/stderr to the given log files.
+
+    If combine_streams is True, stderr is redirected to stdout and only stdout_log_path is used.
+    """
+    # Ensure directories exist
+    if stdout_log_path:
+        os.makedirs(os.path.dirname(stdout_log_path), exist_ok=True)
+    if not combine_streams and stderr_log_path:
+        os.makedirs(os.path.dirname(stderr_log_path), exist_ok=True)
+
+    stdout_fh = open(stdout_log_path if stdout_log_path else os.devnull, 'a', encoding='utf-8')
+    stderr_fh = None
+    try:
+        if combine_streams:
+            return subprocess.run(
+                cmd,
+                cwd=cwd,
+                check=True,
+                stdout=stdout_fh,
+                stderr=subprocess.STDOUT,
+            )
+        else:
+            stderr_fh = open(stderr_log_path if stderr_log_path else os.devnull, 'a', encoding='utf-8')
+            return subprocess.run(
+                cmd,
+                cwd=cwd,
+                check=True,
+                stdout=stdout_fh,
+                stderr=stderr_fh,
+            )
+    finally:
+        try:
+            stdout_fh.close()
+        except Exception:
+            pass
+        if stderr_fh is not None:
+            try:
+                stderr_fh.close()
+            except Exception:
+                pass
+
+
diff --git a/public_evaluations/main.py b/public_evaluations/main.py
index 93d7d03..2ce917d 100644
--- a/public_evaluations/main.py
+++ b/public_evaluations/main.py
@@ -6,25 +6,31 @@
 import argparse
 import numpy as np
 import subprocess
+import tempfile
 from tqdm import tqdm
+from log_utils import tee_parent_logs, run_subprocess_with_logs, prepare_logs_paths
 from conversation_creator import ConversationCreator
 
-
-
+## CONSTANTS for chunk size moved to constants.py to avoid circular imports
+## python main.py --agent_name mirix --dataset LOCOMO --config_path ../mirix/configs/mirix_azure_example.yaml
+## python main.py --agent_name mirix --dataset MemoryAgentBench --config_path ../mirix/configs/mirix_azure_example.yaml --num_exp 2
 def parse_args():
     parser = argparse.ArgumentParser(description="Multi-Modal Memory Illustration")
     parser.add_argument("--agent_name", type=str, choices=['gpt-long-context', 'mirix', 'siglip', 'gemini-long-context'])
-    parser.add_argument("--dataset", type=str, default="LOCOMO", choices=['LOCOMO', 'ScreenshotVQA'])
-    parser.add_argument("--num_exp", type=int, default=-1)
+    parser.add_argument("--dataset", type=str, default="LOCOMO", choices=['LOCOMO', 'ScreenshotVQA', 'MemoryAgentBench'])
+    parser.add_argument("--num_exp", type=int, default=5)
     parser.add_argument("--load_db_from", type=str, default=None)
     parser.add_argument("--num_images_to_accumulate", default=None, type=int)
     parser.add_argument("--global_idx", type=int, default=None)
     parser.add_argument("--model_name", type=str, default="gpt-4.1-mini", help="Model name to use for gpt-long-context agent")
     parser.add_argument("--config_path", type=str, default=None, help="Config file path for mirix agent")
     parser.add_argument("--force_answer_question", action="store_true", default=False)
+    # for MemoryAgentBench / , "eventqa_full"
+    parser.add_argument("--sub_datasets", nargs='+', type=str, default=["longmemeval_s*"], help="Sub-datasets to run")
+    
     return parser.parse_args()
 
-def run_subprocess_interactive(args, global_idx):
+def run_subprocess_interactive(args, global_idx, logs=None):
     """
     Run the run_instance.py script using subprocess with interactive capability.
     """
@@ -34,7 +40,8 @@ def run_subprocess_interactive(args, global_idx):
         '--agent_name', args.agent_name,
         '--dataset', args.dataset,
         '--global_idx', str(global_idx),
-        '--num_exp', str(args.num_exp)
+        '--num_exp', str(args.num_exp),
+        '--sub_datasets', *args.sub_datasets
     ]
     
     # Add optional arguments
@@ -46,36 +53,71 @@ def run_subprocess_interactive(args, global_idx):
         cmd.append('--force_answer_question')
     
     try:
-        # Run the subprocess without capturing output (allows interactive debugging)
         print(f"Running subprocess for global_idx {global_idx}")
-        result = subprocess.run(cmd, cwd=os.path.dirname(os.path.abspath(__file__)), 
-                              check=True)  # No capture_output=True
-        
+        parent_log_path = logs.parent if logs is not None else None
+        if parent_log_path:
+            try:
+                with open(parent_log_path, 'a', encoding='utf-8') as plog:
+                    plog.write(f"[main] Starting subprocess for global_idx {global_idx}: {' '.join(cmd)}\n")
+            except Exception:
+                pass
+
+        result = run_subprocess_with_logs(
+            cmd,
+            cwd=os.path.dirname(os.path.abspath(__file__)),
+            stdout_log_path=(logs.child if logs is not None else None),
+            stderr_log_path=None,
+            combine_streams=True,
+        )
+
         print(f"Subprocess completed successfully for global_idx {global_idx}")
-            
+        if parent_log_path:
+            try:
+                with open(parent_log_path, 'a', encoding='utf-8') as plog:
+                    plog.write(f"[main] Subprocess completed for global_idx {global_idx} (rc=0)\n")
+            except Exception:
+                pass
     except subprocess.CalledProcessError as e:
         print(f"Subprocess failed for global_idx {global_idx} with return code {e.returncode}")
+        if parent_log_path:
+            try:
+                with open(parent_log_path, 'a', encoding='utf-8') as plog:
+                    plog.write(f"[main] Subprocess failed for global_idx {global_idx} (rc={e.returncode})\n")
+            except Exception:
+                pass
         raise
 
 def main():
     
+    # parse arguments
     args = parse_args()
-    conversation_creator = ConversationCreator(args.dataset, args.num_exp)
-
-    if args.agent_name == 'gpt-long-context':
-        with_instructions = False
-    else: 
-        with_instructions = True
-
-    all_chunks = conversation_creator.chunks(with_instructions=with_instructions)
-    all_queries_and_answers = conversation_creator.get_query_and_answer()
+    
+    # initialize run count
+    conversation_creator = ConversationCreator(args.dataset, args.num_exp, args.sub_datasets)
+    dataset_length = conversation_creator.get_dataset_length()
 
-    for global_idx, (chunks, queries_and_answers) in enumerate(zip(all_chunks, all_queries_and_answers)):
+    for global_idx in tqdm(range(dataset_length), desc="Running subprocesses", unit="item"):
         
         if args.global_idx is not None and global_idx != args.global_idx:
             continue
         
-        run_subprocess_interactive(args, global_idx)
+        logs = prepare_logs_paths(args, global_idx)
+        with tee_parent_logs(logs.parent):
+            try:
+                run_subprocess_interactive(
+                    args,
+                    global_idx,
+                    logs=logs,
+                )
+            except Exception as e:
+                # Log the error and continue with the next item
+                try:
+                    with open(logs.parent, 'a', encoding='utf-8') as plog:
+                        plog.write(f"[main] Error during run for global_idx {global_idx}: {repr(e)}\n")
+                except Exception:
+                    pass
+                print(f"Encountered error for global_idx {global_idx}: {e}. Continuing to next.")
+                continue
 
 if __name__ == '__main__':
     main()
diff --git a/public_evaluations/run_instance.py b/public_evaluations/run_instance.py
old mode 100755
new mode 100644
index 38ae6bc..4e19bd5
--- a/public_evaluations/run_instance.py
+++ b/public_evaluations/run_instance.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 from dotenv import load_dotenv
-load_dotenv("../.env")
+load_dotenv()
 
 import os
 import json
@@ -9,17 +9,19 @@
 from tqdm import tqdm
 from agent import AgentWrapper
 from conversation_creator import ConversationCreator
-
+from constants import CHUNK_SIZE_MEMORY_AGENT_BENCH
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Run instance with chunks and questions")
     parser.add_argument("--agent_name", type=str, required=True, choices=['gpt-long-context', 'mirix', 'siglip', 'gemini-long-context'])
-    parser.add_argument("--dataset", type=str, default="LOCOMO", choices=['LOCOMO', 'ScreenshotVQA'])
+    parser.add_argument("--dataset", type=str, default="LOCOMO", choices=['LOCOMO', 'ScreenshotVQA', 'MemoryAgentBench'])
     parser.add_argument("--global_idx", type=int, required=True)
     parser.add_argument("--num_exp", type=int, default=-1)
-    parser.add_argument("--model_name", type=str, default="gpt-4.1", help="Model name to use for gpt-long-context agent")
+    parser.add_argument("--model_name", type=str, default="gpt-4.1-mini", help="Model name to use for gpt-long-context agent")
     parser.add_argument("--config_path", type=str, default=None, help="Config file path for mirix agent")
     parser.add_argument("--force_answer_question", action="store_true", default=False)
+    parser.add_argument("--sub_datasets", nargs='+', type=str, default=["longmemeval_s*"], help="Sub-datasets to run")
+     
     return parser.parse_args()
 
 
@@ -28,22 +30,32 @@ def run_with_chunks_and_questions(
         global_idx,
         chunks, 
         queries_and_answers):
-
-    out_dir = f"./results/{args.agent_name}_{args.dataset}/"
+    
+    # dataset metadata
+    if args.dataset == 'MemoryAgentBench':
+        subset_name = queries_and_answers[0][3]
+        chunk_size = CHUNK_SIZE_MEMORY_AGENT_BENCH[subset_name]
+    else:
+        subset_name = "None"
+        chunk_size = "None"
+    
+    # make out_dir with the model name / save all the parameters
     if args.agent_name == 'gpt-long-context' or args.agent_name == 'gemini-long-context':
         out_dir = f"./results/{args.agent_name}_{args.dataset}-{args.model_name}/"
+    else:
+        out_dir = f"./results/{args.agent_name}_{args.dataset}-model{args.model_name}/"
 
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
+    
+    out_dir = out_dir + f"{global_idx}_subset{subset_name}_cksize{chunk_size}"
 
-    out_dir = out_dir + f"{global_idx}"
-
+    
+    # if out_dir exists, load the agent from it
     if os.path.exists(out_dir):
-        
         agent = AgentWrapper(args.agent_name, load_agent_from=out_dir, model_name=args.model_name, config_path=args.config_path)
-    
+    # create an agent
     else:
-
         if args.agent_name == 'mirix':
             if os.path.exists(os.path.expanduser(f"~/.mirix/sqlite.db")):
                 # need to delete the existing db
@@ -51,6 +63,8 @@ def run_with_chunks_and_questions(
 
         agent = AgentWrapper(args.agent_name, model_name=args.model_name, config_path=args.config_path)
 
+
+    # load the current step & chunks for continuing memory accumulation
     if os.path.exists(f"{out_dir}/current_step.txt"):
         with open(f"{out_dir}/current_step.txt", "rb") as f:
             current_step = int(f.read().decode())
@@ -87,20 +101,28 @@ def run_with_chunks_and_questions(
                 'response': response
             })
 
-        if args.agent_name == 'mirix':
-            agent.save_agent(out_dir)
+        # save the chunks and current step in chunking
+        # TODO: if args.agent_name == 'mirix':
+        agent.save_agent(out_dir)
 
-            with open(f"{out_dir}/current_step.txt", "wb") as f:
-                f.write(str(idx).encode())
+        with open(f"{out_dir}/chunks.json", "w") as f:
+            json.dump(existing_chunks, f, indent=2)
 
-            with open(f"{out_dir}/chunks.json", "w") as f:
-                json.dump(existing_chunks, f, indent=2)
+        with open(f"{out_dir}/current_step.txt", "wb") as f:
+            f.write(str(idx).encode())
 
 
+    # save the agent
     agent.save_agent(out_dir)
-
     agent.prepare_before_asking_questions()
 
+
+    # save the parameters
+    with open(f"{out_dir}/parameters.json", "w") as f:
+        json.dump(args.__dict__, f, indent=2)
+        
+        
+    # load the results to continue from last breakpoint
     if os.path.exists(f"{out_dir}/results.json"):
         existing_results = json.load(open(f"{out_dir}/results.json", "r"))
     else:
@@ -110,22 +132,26 @@ def run_with_chunks_and_questions(
 
     all_questions = [x['question'] for x in existing_results]
 
+
+    # QA loop
     for item in queries_and_answers:
 
-        if (item[3]['question'] if len(item) > 3 else item[1]) in all_questions:
-            item_idx = all_questions.index(item[3]['question'] if len(item) > 3 else item[1])
+        question_text = item[1]
+
+        if question_text in all_questions:
+            item_idx = all_questions.index(question_text)
             if 'metadata' not in existing_results[item_idx]:
-                existing_results[item_idx]['metadata'] = item[3]
+                existing_results[item_idx]['metadata'] = item[3] if len(item) > 3 else None
                 with open(f"{out_dir}/results.json", "w") as f:
                     json.dump(existing_results, f, indent=2)
             continue
-        print("Question [{} / {}]: ".format(len(existing_results), len(queries_and_answers)), item[3]['question'] if len(item) > 3 else item[1])
+        print("Question [{} / {}]: ".format(len(existing_results), len(queries_and_answers)), question_text)
 
-        response = agent.send_message(item[1], memorizing=False)
+        response = agent.send_message(question_text, memorizing=False)
 
         existing_results.append(
             {
-                'question': item[3]['question'] if len(item) > 3 else item[1],
+                'question': question_text,
                 'response': response,
                 'answer': item[2],
                 'metadata': item[3] if len(item) > 3 else None
@@ -135,6 +161,7 @@ def run_with_chunks_and_questions(
         with open(f"{out_dir}/results.json", "w") as f:
             json.dump(existing_results, f, indent=2)
         
+        # need to delete the existing db
         if args.agent_name == 'mirix':
             if os.path.exists(os.path.expanduser(f"~/.mirix/sqlite.db")):
                 # need to delete the existing db
@@ -146,8 +173,8 @@ def main():
     args = parse_args()
     
     # Create ConversationCreator and load data for the specific global_idx
-    conversation_creator = ConversationCreator(args.dataset, args.num_exp)
-    
+    conversation_creator = ConversationCreator(args.dataset, args.num_exp, args.sub_datasets)
+
     # Determine with_instructions based on agent_name
     if args.agent_name == 'gpt-long-context':
         with_instructions = False
@@ -169,4 +196,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file