- 
                Notifications
    
You must be signed in to change notification settings  - Fork 84
 
Feature: categorization of notebooks to simplify huge repositories navigation #190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
0c5b9ee
              c8e81f4
              fc3d2a7
              2f114bc
              733573f
              97a90f5
              d74373e
              c31e8a5
              fd8523f
              f663178
              33e7d0e
              db67d7d
              bda0994
              eee88d0
              fd947d8
              6e73cdc
              9fad0e9
              ad644b5
              f1ddbe3
              3263553
              ee43675
              2cf120e
              1563576
              0b0b713
              3eaefcd
              05e210d
              ced0ced
              df92fab
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| 
                       There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we please add some documentation/screenshots of the functionality?  | 
            
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -70,6 +70,12 @@ def filesystem_default_value(dirname): | |
| is_flag=True, | ||
| help="If selected, notebooker set current working directory to absolute path of the notebook to keep it local context available", | ||
| ) | ||
| @click.option( | ||
| "--categorization", | ||
| default=False, | ||
| is_flag=True, | ||
| help="If selected, discovers only templates with the 'category=example' tags set to any cell and groups notebooks by their category names", | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the expected behaviour when multiple cells in the same notebook have clashing categories?  | 
||
| ) | ||
| @click.option( | ||
| "--default-mailfrom", default=DEFAULT_MAILFROM_ADDRESS, help="Set a new value for the default mailfrom setting." | ||
| ) | ||
| 
        
          
        
         | 
    @@ -91,6 +97,7 @@ def base_notebooker( | |
| py_template_subdir, | ||
| notebooker_disable_git, | ||
| execute_at_origin, | ||
| categorization, | ||
| default_mailfrom, | ||
| running_timeout, | ||
| serializer_cls, | ||
| 
        
          
        
         | 
    @@ -106,6 +113,7 @@ def base_notebooker( | |
| PY_TEMPLATE_SUBDIR=py_template_subdir, | ||
| NOTEBOOKER_DISABLE_GIT=notebooker_disable_git, | ||
| EXECUTE_AT_ORIGIN=execute_at_origin, | ||
| CATEGORIZATION=categorization, | ||
| DEFAULT_MAILFROM=default_mailfrom, | ||
| RUNNING_TIMEOUT=running_timeout, | ||
| ) | ||
| 
          
            
          
           | 
    @@ -180,6 +188,7 @@ def start_webapp( | |
| 
     | 
||
| @base_notebooker.command() | ||
| @click.option("--report-name", help="The name of the template to execute, relative to the template directory.") | ||
| @click.option("--category", default="", help="Category of the template.") | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't this come from the cell tags? How come this needs to be passed in?  | 
||
| @click.option( | ||
| "--overrides-as-json", default="{}", help="The parameters to inject into the notebook template, in JSON format." | ||
| ) | ||
| 
          
            
          
           | 
    @@ -230,6 +239,7 @@ def start_webapp( | |
| def execute_notebook( | ||
| config: BaseConfig, | ||
| report_name, | ||
| category, | ||
| overrides_as_json, | ||
| iterate_override_values_of, | ||
| report_title, | ||
| 
        
          
        
         | 
    @@ -250,6 +260,7 @@ def execute_notebook( | |
| return execute_notebook_entrypoint( | ||
| config, | ||
| report_name, | ||
| category, | ||
| overrides_as_json, | ||
| iterate_override_values_of, | ||
| report_title, | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -54,6 +54,7 @@ def _run_checks( | |
| scheduler_job_id: Optional[str] = None, | ||
| mailfrom: Optional[str] = None, | ||
| is_slideshow: bool = False, | ||
| category: Optional[str] = None, | ||
| ) -> NotebookResultComplete: | ||
| """ | ||
| This is the actual method which executes a notebook, whether running in the webapp or via the entrypoint. | ||
| 
          
            
          
           | 
    @@ -152,6 +153,7 @@ def _run_checks( | |
| generate_pdf_output=generate_pdf_output, | ||
| report_name=template_name, | ||
| report_title=report_title, | ||
| category=category, | ||
| overrides=overrides, | ||
| scheduler_job_id=scheduler_job_id, | ||
| mailfrom=mailfrom, | ||
| 
        
          
        
         | 
    @@ -164,6 +166,7 @@ def _run_checks( | |
| def run_report( | ||
| job_submit_time, | ||
| report_name, | ||
| category, | ||
| overrides, | ||
| result_serializer, | ||
| report_title="", | ||
| 
          
            
          
           | 
    @@ -222,6 +225,7 @@ def run_report( | |
| scheduler_job_id=scheduler_job_id, | ||
| mailfrom=mailfrom, | ||
| is_slideshow=is_slideshow, | ||
| category=category, | ||
| ) | ||
| logger.info("Successfully got result.") | ||
| result_serializer.save_check_result(result) | ||
| 
        
          
        
         | 
    @@ -234,6 +238,7 @@ def run_report( | |
| job_start_time=job_submit_time, | ||
| report_name=report_name, | ||
| report_title=report_title, | ||
| category=category, | ||
| error_info=error_info, | ||
| overrides=overrides, | ||
| mailto=mailto, | ||
| 
        
          
        
         | 
    @@ -257,6 +262,7 @@ def run_report( | |
| return run_report( | ||
| job_submit_time, | ||
| report_name, | ||
| category, | ||
| overrides, | ||
| result_serializer, | ||
| report_title=report_title, | ||
| 
          
            
          
           | 
    @@ -351,6 +357,7 @@ def _get_overrides(overrides_as_json: AnyStr, iterate_override_values_of: Option | |
| def execute_notebook_entrypoint( | ||
| config: BaseConfig, | ||
| report_name: str, | ||
| category: str, | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an   | 
||
| overrides_as_json: str, | ||
| iterate_override_values_of: Union[List[str], str], | ||
| report_title: str, | ||
| 
        
          
        
         | 
    @@ -377,6 +384,7 @@ def execute_notebook_entrypoint( | |
| start_time = datetime.datetime.now() | ||
| logger.info("Running a report with these parameters:") | ||
| logger.info("report_name = %s", report_name) | ||
| logger.info("category = %s", category) | ||
| logger.info("overrides_as_json = %s", overrides_as_json) | ||
| logger.info("iterate_override_values_of = %s", iterate_override_values_of) | ||
| logger.info("report_title = %s", report_title) | ||
| 
          
            
          
           | 
    @@ -407,6 +415,7 @@ def execute_notebook_entrypoint( | |
| result = run_report( | ||
| start_time, | ||
| report_name, | ||
| category, | ||
| overrides, | ||
| result_serializer, | ||
| report_title=report_title, | ||
| 
          
            
          
           | 
    @@ -495,6 +504,7 @@ def run_report_in_subprocess( | |
| email_subject=None, | ||
| n_retries=3, | ||
| is_slideshow=False, | ||
| category=None, | ||
| ) -> str: | ||
| """ | ||
| Execute the Notebooker report in a subprocess. | ||
| 
        
          
        
         | 
    @@ -513,6 +523,7 @@ def run_report_in_subprocess( | |
| :param email_subject: `str` if passed, then this string will be used in the email subject | ||
| :param n_retries: The number of retries to attempt. | ||
| :param is_slideshow: Whether the notebook is a reveal.js slideshow or not. | ||
| :param category: Category of the notebook | ||
| :return: The unique job_id. | ||
| """ | ||
| if error_mailto is None: | ||
| 
        
          
        
         | 
    @@ -535,6 +546,7 @@ def run_report_in_subprocess( | |
| is_slideshow=is_slideshow, | ||
| email_subject=email_subject, | ||
| mailfrom=mailfrom, | ||
| category=category, | ||
| ) | ||
| 
     | 
||
| command = ( | ||
| 
          
            
          
           | 
    @@ -578,6 +590,7 @@ def run_report_in_subprocess( | |
| + (["--is-slideshow"] if is_slideshow else []) | ||
| + ([f"--scheduler-job-id={scheduler_job_id}"] if scheduler_job_id is not None else []) | ||
| + ([f"--mailfrom={mailfrom}"] if mailfrom is not None else []) | ||
| + ([f"--category={category}"] if category is not None else []) | ||
| + ([f"--email-subject={email_subject}"] if email_subject else []) | ||
| ) | ||
| p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| 
          
            
          
           | 
    @@ -209,6 +209,7 @@ def save_check_stub( | |||||
| is_slideshow: bool = False, | ||||||
| email_subject: Optional[str] = None, | ||||||
| mailfrom: Optional[str] = None, | ||||||
| category: Optional[str] = None, | ||||||
| ) -> None: | ||||||
| """Call this when we are just starting a check. Saves a "pending" job into storage.""" | ||||||
| job_start_time = job_start_time or datetime.datetime.now() | ||||||
| 
        
          
        
         | 
    @@ -228,6 +229,7 @@ def save_check_stub( | |||||
| scheduler_job_id=scheduler_job_id, | ||||||
| is_slideshow=is_slideshow, | ||||||
| mailfrom=mailfrom, | ||||||
| category=category, | ||||||
| ) | ||||||
| self._save_to_db(pending_result) | ||||||
| 
     | 
||||||
| 
          
            
          
           | 
    @@ -325,6 +327,7 @@ def _convert_result( | |||||
| scheduler_job_id=result.get("scheduler_job_id", None), | ||||||
| is_slideshow=result.get("is_slideshow", False), | ||||||
| email_subject=result.get("email_subject", None), | ||||||
| category=result.get("category", None), | ||||||
| ) | ||||||
| elif cls == NotebookResultPending: | ||||||
| return NotebookResultPending( | ||||||
| 
        
          
        
         | 
    @@ -344,6 +347,7 @@ def _convert_result( | |||||
| stdout=result.get("stdout", []), | ||||||
| scheduler_job_id=result.get("scheduler_job_id", None), | ||||||
| is_slideshow=result.get("is_slideshow", False), | ||||||
| category=result.get("category", None), | ||||||
| ) | ||||||
| 
     | 
||||||
| elif cls == NotebookResultError: | ||||||
| 
        
          
        
         | 
    @@ -370,6 +374,7 @@ def _convert_result( | |||||
| stdout=result.get("stdout", []), | ||||||
| scheduler_job_id=result.get("scheduler_job_id", False), | ||||||
| is_slideshow=result.get("is_slideshow", False), | ||||||
| category=result.get("category", None), | ||||||
| ) | ||||||
| else: | ||||||
| raise ValueError("Could not deserialise {} into result object.".format(result)) | ||||||
| 
          
            
          
           | 
    @@ -397,10 +402,17 @@ def _get_result_count(self, base_filter): | |||||
| 
     | 
||||||
| def get_count_and_latest_time_per_report(self, subfolder: Optional[str]): | ||||||
| base_filer = {} if not subfolder else {"report_name": {"$regex": subfolder + ".*"}} | ||||||
| return self.fetch_reports(base_filer) | ||||||
| 
     | 
||||||
| def get_count_and_latest_time_per_report_per_category(self, category: Optional[str]): | ||||||
| base_filer = {} if not category else {"category": category} | ||||||
| return self.fetch_reports(base_filer) | ||||||
| 
     | 
||||||
| def fetch_reports(self, base_filer: Dict[str, Any]): | ||||||
| reports = list( | ||||||
| self._get_raw_results( | ||||||
| base_filter=base_filer, | ||||||
| projection={"report_name": 1, "job_start_time": 1, "scheduler_job_id": 1, "_id": 0}, | ||||||
| projection={"report_name": 1, "job_start_time": 1, "scheduler_job_id": 1, "category": 1, "_id": 0}, | ||||||
| limit=0, | ||||||
| ) | ||||||
| ) | ||||||
| 
        
          
        
         | 
    @@ -411,7 +423,12 @@ def get_count_and_latest_time_per_report(self, subfolder: Optional[str]): | |||||
| for report, all_runs in jobs_by_name.items(): | ||||||
| latest_start_time = max(r["job_start_time"] for r in all_runs) | ||||||
| scheduled_runs = len([x for x in all_runs if x.get("scheduler_job_id")]) | ||||||
| output[report] = {"count": len(all_runs), "latest_run": latest_start_time, "scheduler_runs": scheduled_runs} | ||||||
| output[report] = { | ||||||
| "count": len(all_runs), | ||||||
| "latest_run": latest_start_time, | ||||||
| "scheduler_runs": scheduled_runs, | ||||||
| "category": r["category"], | ||||||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This key will not exist if it is not in the mongo document. 
        Suggested change
       
    
  | 
||||||
| } | ||||||
| return output | ||||||
| 
     | 
||||||
| def get_all_results( | ||||||
| 
          
            
          
           | 
    ||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| 
          
            
          
           | 
    @@ -28,6 +28,9 @@ class BaseConfig: | |||||
| # A boolean flag to dictate whether we should execute the notebook at the origin or not. | ||||||
| EXECUTE_AT_ORIGIN: bool = False | ||||||
| 
     | 
||||||
| # A boolean flag to dictate whether we should discover and group notebooker by their category tags. | ||||||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
        Suggested change
       
    
  | 
||||||
| CATEGORIZATION: bool = False | ||||||
| 
     | 
||||||
| # The serializer class we are using for storage, e.g. PyMongoResultSerializer | ||||||
| SERIALIZER_CLS: DEFAULT_SERIALIZER = None | ||||||
| # The dictionary of parameters which are used to initialize the serializer class above | ||||||
| 
          
            
          
           | 
    ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we please revert the version bump until we have ensured the tests pass on master