
Commit f8d69f1

Analytics container creates connection string from injected env vars (#46)
* **Analytics container creates connection string from injected env vars.** Previously, the db connection string was passed in by the client. This is now optional behaviour: the default, when no string is supplied, is to construct the string *on the container* using the injected credentials. Default env vars renamed to match 5STES defaults. Minor updates to readme and examples.
* **Fixed tests.** Fixed tests to pass correctly, taking into account the renamed variables and the environment variables.
* **Fixed more tests.** Fixed more tests; added tests and validation for env vars.
* **Fixed broken import.** Importing files from the docker directory in tests failed without changes to pyproject.toml. The file is now imported at runtime (but this is only for a unit test).
* **Parsed URL in tests rather than checking substrings.** Previous tests didn't check that the URL was correctly formed and failed the automatic checks.
* **For testing new container.**
* **Remove build on push.** Removed the GitHub Action that builds the container on push.
* **Tidy up.** General tidy-up, removing comments, etc.
* **Tidy test output messages.**
* **Corrected tests and added docker readme.** Moved some block comments out of the query_resolver and used them to build a small (AI-generated) readme for the docker container instead. Removed AI-generated scripts from the docker tests and used pytest integration decorators instead.
1 parent 03c4189 commit f8d69f1

17 files changed

Lines changed: 492 additions & 395 deletions

README.md

Lines changed: 3 additions & 9 deletions
````diff
@@ -29,19 +29,13 @@ All variables in `env.example` are **required**. Here's what you need to configure:
 TES_BASE_URL=http://your-tes-endpoint:5034/v1/tasks
 TES_DOCKER_IMAGE=harbor.your-registry.com/your-image:tag
 
-# Database Configuration
-DB_HOST=your-database-host
-DB_PORT=5432
-DB_USERNAME=your-database-username
-DB_PASSWORD=your-database-password
-DB_NAME=your-database-name
-
 # MinIO Configuration
 MINIO_STS_ENDPOINT=http://your-minio-endpoint:9000/sts
 MINIO_ENDPOINT=your-minio-endpoint:9000
 MINIO_OUTPUT_BUCKET=your-output-bucket-name
 ```
 
+
 ### 3. Installation
 
 ```bash
@@ -66,13 +60,13 @@ import os
 analytics_tes = AnalyticsTES()
 orchestrator = AnalysisOrchestrator(tes_client=analytics_tes)
 analysis_runner = AnalysisRunner(tes_client=analytics_tes)
-sql_schema = os.getenv("SQL_SCHEMA", "public")
+sql_schema = os.getenv("postgresSchema", "public")
 
 
 
 # Define your own SQL query
 query_template = Template("""WITH user_query AS (
-    SELECT value_as_number FROM $schema.measurement
+    SELECT value_as_number FROM $sql_schema.measurement
     WHERE measurement_concept_id = 21490742
     AND value_as_number IS NOT NULL
 )
````

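Worth noting why the `$schema` to `$sql_schema` rename in these examples matters: `string.Template.safe_substitute` silently leaves unknown placeholders in place rather than raising, so the old mismatched keyword produced SQL that still contained the literal placeholder. A minimal sketch of the failure mode:

```python
from string import Template

template = Template("SELECT value_as_number FROM $sql_schema.measurement")

# Mismatched keyword: safe_substitute leaves the placeholder untouched
# instead of raising, so the broken SQL goes to the database as-is.
broken = template.safe_substitute(schema="public")
assert broken == "SELECT value_as_number FROM $sql_schema.measurement"

# Matching keyword: the placeholder is filled as intended.
fixed = template.safe_substitute(sql_schema="public")
assert fixed == "SELECT value_as_number FROM public.measurement"
```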
analysis_orchestrator.py

Lines changed: 23 additions & 0 deletions
````diff
@@ -41,8 +41,11 @@ def __init__(
         self.token_session = token_session
         self.project = project
         self.tes_client = tes_client
+        ## set to None here to be explicitly set later, either by passing args or environment variables
+        self.tres = None
         self.minio_client = MinIOClient(token_session=token_session)
 
+
     def parse_tres(self, tres: str) -> List[str]:
         """
         Parse the TREs from the environment variable.
@@ -114,13 +117,33 @@ def _submit_and_collect_results(self,
 
         task_id = result['id']
         print(f"Task ID: {task_id}")
+
         results_paths = [f"{int(task_id) + i + 1}/output.{output_format}" for i in range(n_results)]
 
         # Use polling engine to collect results
         polling_engine = polling.Polling(self.tes_client, self.minio_client, task_id)
         data = polling_engine.poll_results(results_paths, bucket, n_results, polling_interval=10)
 
         return task_id, data
+
+    def collect_results(self, task_id: str, token: str = None, bucket: str = None, output_format: str = "json"):
+        if token is None:
+            token = os.getenv('5STES_TOKEN')
+            if not token:
+                raise ValueError("5STES_TOKEN environment variable is required when token parameter is not provided")
+        self.token = token
+        if self.tres is None:
+            tres = os.getenv('5STES_TRES')
+            if not tres:
+                raise ValueError("5STES_TRES environment variable is required when tres parameter is not provided")
+            self.tres = self.parse_tres(tres)
+        if bucket is None:
+            bucket = os.getenv('MINIO_OUTPUT_BUCKET')
+            if not bucket:
+                raise ValueError("MINIO_OUTPUT_BUCKET environment variable is required when bucket parameter is not provided")
+        n_results = len(self.tres)
+        results_paths = [f"{int(task_id) + i + 1}/output.{output_format}" for i in range(n_results)]
+        return self._collect_results(results_paths, bucket, n_results)
 
     def _collect_results(self, results_paths: List[str], bucket: str, n_results: int) -> List[str]:
         """
````

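A hypothetical usage sketch for the new `collect_results` method, re-attaching to an already-submitted task. The import paths and the comma-separated `5STES_TRES` format are assumptions; the real format is whatever `parse_tres` expects:

```python
import os

# Import paths assumed for illustration.
from analytics_tes import AnalyticsTES
from analysis_orchestrator import AnalysisOrchestrator

# collect_results falls back to these when the corresponding
# keyword arguments are omitted, raising ValueError if unset.
os.environ["5STES_TOKEN"] = "example-token"
os.environ["5STES_TRES"] = "tre-a,tre-b"  # assumed comma-separated
os.environ["MINIO_OUTPUT_BUCKET"] = "example-output-bucket"

orchestrator = AnalysisOrchestrator(tes_client=AnalyticsTES())

# Re-attach to a previously submitted task and fetch its outputs.
data = orchestrator.collect_results(task_id="1234", output_format="json")
```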
analysis_runner.py

Lines changed: 4 additions & 4 deletions
````diff
@@ -261,14 +261,14 @@ def get_supported_analysis_types(self) -> List[str]:
     analysis_runner = AnalysisRunner()
 
     ## need this to populate the query template
-    sql_schema = os.getenv("SQL_SCHEMA", "public")
+    sql_schema = os.getenv("postgresSchema", "public")
 
     # Example: Run variance analysis first, then mean analysis on the same data
-    query_template = Template("""SELECT value_as_number FROM $schema.measurement
-                                 WHERE measurement_concept_id = 21490742
+    query_template = Template("""SELECT value_as_number FROM $sql_schema.measurement
+                                 WHERE measurement_concept_id = 43055141
                                  AND value_as_number IS NOT NULL""")
 
-    user_query = query_template.safe_substitute(schema=sql_schema)
+    user_query = query_template.safe_substitute(sql_schema=sql_schema)
 
     print("Running mean analysis...")
     mean_result = analysis_runner.run_analysis(
````

analytics_tes.py

Lines changed: 8 additions & 6 deletions
````diff
@@ -25,10 +25,10 @@ def _set_env(self) -> None:
         Set the environment variables for a TES task.
         """
         self.env = {
-            "DATASOURCE_DB_DATABASE": self.default_db_config['name'],
-            "DATASOURCE_DB_HOST": self.default_db_config['host'],
-            "DATASOURCE_DB_PASSWORD": self.default_db_config['password'],
-            "DATASOURCE_DB_USERNAME": self.default_db_config['username']
+            "postgresDatabase": self.default_db_config['name'],
+            "postgresServer": self.default_db_config['host'],
+            "postgresPassword": self.default_db_config['password'],
+            "postgresUsername": self.default_db_config['username']
         }
         return None
 
@@ -37,14 +37,16 @@ def _set_command(self, query: str, analysis_type: str, output_path: str, output_
         Set the command for a TES task.
         """
 
-        connection_string = f"postgresql://postgres:{self.default_db_config['password']}@{self.default_db_config['host']}:{self.default_db_config['port']}/{self.default_db_config['name']}"
+
         self.command = [
             f"--user-query={query}",
             f"--analysis={analysis_type}",
-            f"--db-connection={connection_string}",
             f"--output-filename={output_path}/output",
             f"--output-format={output_format}"
         ]
+        ## add this one if you want to override the injected connection vars with a command line argument.
+        # connection_string = f"postgresql://postgres:{self.default_db_config['password']}@{self.default_db_config['host']}:{self.default_db_config['port']}/{self.default_db_config['name']}"
+        # f"--db-connection={connection_string}",
        return None
 
     def set_executors(self, query, analysis_type, workdir = "/app", output_path="/outputs", output_format="json") -> None:
````

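With the `--db-connection` argument removed from the default command, the container is now responsible for assembling the connection string from the injected `postgres*` variables (see `docker/README.md` below). A rough sketch of that construction; the real logic lives in `parse_connection_string()` in `docker/query_resolver.py`:

```python
import os

REQUIRED_VARS = ["postgresUsername", "postgresPassword", "postgresServer",
                 "postgresPort", "postgresDatabase"]

def build_connection_string() -> str:
    """Illustrative only: assemble a postgresql:// URL from injected env vars."""
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        raise ValueError(f"Missing required environment variables: {missing}")
    return (
        f"postgresql://{os.environ['postgresUsername']}:{os.environ['postgresPassword']}"
        f"@{os.environ['postgresServer']}:{os.environ['postgresPort']}"
        f"/{os.environ['postgresDatabase']}"
    )
```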
bunny_tes.py

Lines changed: 16 additions & 9 deletions
````diff
@@ -19,9 +19,9 @@ def __init__(self, *args, **kwargs):
         self.task_api_username = os.getenv('TASK_API_USERNAME')
         self.task_api_password = os.getenv('TASK_API_PASSWORD')
 
-        # Add schema to default_db_config if not already present
-        if 'schema' not in self.default_db_config:
-            self.default_db_config['schema'] = os.getenv('SQL_SCHEMA')  # None if not set - will fail naturally if needed
+        # Schema: use postgresSchema throughout; entrypoint passes it to bunny as DATASOURCE_DB_SCHEMA
+        if not self.default_db_config.get('schema'):
+            self.default_db_config['schema'] = os.getenv('postgresSchema')
 
     #### this section will be implemented for each type of task using the pytes classes. Note that many of these fields are set in the submission layer after submission.
     def set_inputs(self) -> None:
@@ -43,14 +43,21 @@ def set_outputs(self, name: str, output_path: str, output_type: str = "DIRECTORY
     def _set_env(self) -> None:
         """
         Set the environment variables for a TES task.
+        Container entrypoint reads postgres* and exports DATASOURCE_DB_* for bunny; we set both.
         """
+        db = self.default_db_config
+        schema = db.get('schema') or 'public'
+        port = str(db.get('port') or '5432')
         self.env = {
-            "DATASOURCE_DB_DATABASE": self.default_db_config['name'],
-            "DATASOURCE_DB_HOST": self.default_db_config['host'],
-            "DATASOURCE_DB_PASSWORD": self.default_db_config['password'],
-            "DATASOURCE_DB_USERNAME": self.default_db_config['username'],
-            "DATASOURCE_DB_PORT": self.default_db_config['port'],
-            "DATASOURCE_DB_SCHEMA": self.default_db_config['schema'],
+            # Names the bunny-wrapper entrypoint reads (postgres* → DATASOURCE_DB_*)
+            "postgresDatabase": db['name'],
+            "postgresServer": db['host'],
+            "postgresPort": port,
+            "postgresSchema": schema,
+            "postgresUsername": db['username'],
+            "postgresPassword": db['password'],
+
+            # Bunny / task API
             "TASK_API_BASE_URL": self.task_api_base_url,
             "TASK_API_USERNAME": self.task_api_username,
             "TASK_API_PASSWORD": self.task_api_password,
````

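The comments in `_set_env` describe a translation the bunny-wrapper entrypoint performs from the injected `postgres*` names to the `DATASOURCE_DB_*` names bunny reads. The entrypoint itself is not part of this diff; a Python sketch of the mapping implied by the old and new env dicts:

```python
import os

# Mapping implied by the deleted DATASOURCE_DB_* keys and the
# added postgres* keys in this diff (both carry the same db config values).
POSTGRES_TO_DATASOURCE = {
    "postgresDatabase": "DATASOURCE_DB_DATABASE",
    "postgresServer": "DATASOURCE_DB_HOST",
    "postgresPort": "DATASOURCE_DB_PORT",
    "postgresSchema": "DATASOURCE_DB_SCHEMA",
    "postgresUsername": "DATASOURCE_DB_USERNAME",
    "postgresPassword": "DATASOURCE_DB_PASSWORD",
}

def export_datasource_vars() -> None:
    """Re-export postgres* values under the DATASOURCE_DB_* names bunny reads."""
    for src, dst in POSTGRES_TO_DATASOURCE.items():
        value = os.getenv(src)
        if value is not None:
            os.environ[dst] = value
```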
docker/README.md

Lines changed: 42 additions & 0 deletions
````diff
@@ -0,0 +1,42 @@
+# Docker: Node-side query and analysis
+
+This folder contains the code that runs **inside the TES container** on each TRE node. It executes the user’s analysis (SQL plus optional Python) against the node’s database and writes the result (e.g. JSON) for the client to aggregate.
+
+## Purpose
+
+- The **user query** and **analysis type** are passed in as CLI arguments.
+- The **analysis type** is looked up in the `LOCAL_PROCESSING_CLASSES` registry in `local_processing.py`.
+- Each analysis class is responsible for:
+  - Building the SQL query (from the user query + analysis-specific logic),
+  - Running it against the node DB,
+  - Optional Python-side analysis on the result.
+- Results are written to file (e.g. JSON) and later collected and aggregated on the client side.
+
+So this code does the **per-node, partial** work; aggregation across TREs happens elsewhere (orchestrator / client).
+
+## Flow
+
+1. **Entrypoint** — Container runs `python query_resolver.py` with CLI args (`--user-query`, `--analysis`, `--db-connection` or env, `--output-filename`, `--output-format`).
+2. **query_resolver.py** — Parses the connection string (from env or `--db-connection`), then calls `process_query()`.
+3. **process_query()** — Resolves the DB connection, looks up the analysis in `LOCAL_PROCESSING_CLASSES`, instantiates the processor, builds and runs the query, runs optional Python analysis, and writes the result to disk.
+4. **local_processing.py** — Defines the registry and analysis classes (e.g. Mean, Variance, PMCC, ContingencyTable). Each class extends `BaseLocalProcessing` (from `local_processing_base.py`) and implements query building and optional Python analysis.
+
+## Main modules
+
+| File | Role |
+|------|------|
+| `query_resolver.py` | Click CLI, connection string parsing (`parse_connection_string`), and `process_query()` (orchestrates DB connection, registry lookup, execution, output). |
+| `local_processing.py` | `LOCAL_PROCESSING_CLASSES` registry and concrete analysis classes (Mean, Variance, etc.). |
+| `local_processing_base.py` | `BaseLocalProcessing` abstract base class (query building, optional Python analysis hook). |
+| `Dockerfile` | Builds the image that runs this code (Python 3.12, dependencies, entrypoint `query_resolver.py`). |
+
+## Database connection
+
+- If `--db-connection` is **not** provided, the connection string is built from environment variables: `postgresUsername`, `postgresPassword`, `postgresServer`, `postgresPort`, `postgresDatabase` (see `validate_environment()` and `parse_connection_string(None)` in `query_resolver.py`). This is the normal case when the container is launched by TES with env set by the task.
+- If `--db-connection` is provided, it can be a SQLAlchemy-style URL (`postgresql://...`) or a semicolon-separated key=value string (`Host=...;Username=...;...`).
+
+## Building and running
+
+From the repo root or this directory, build the image (see project docs or `tests/` for the exact image name and test usage). The container expects either postgres* env vars or `--db-connection`, plus `--user-query`, `--analysis`, and optional output options.
+
+For the **bunny**-based workflow (different image and entrypoint), see `bunny-wrapper/`.
````

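The two `--db-connection` forms the docker README describes could be normalised along these lines. A hedged sketch, not the actual `parse_connection_string` implementation; key names other than `Host` and `Username` in the key=value form are assumptions:

```python
def normalise_db_connection(raw: str) -> str:
    """Return a postgresql:// URL from either accepted --db-connection form.

    Sketch only; real behaviour is defined by parse_connection_string
    in docker/query_resolver.py.
    """
    if raw.startswith("postgresql://"):
        return raw  # already a SQLAlchemy-style URL
    # Semicolon-separated form, e.g. "Host=db;Port=5432;Username=u;Password=p;Database=d"
    parts = dict(item.split("=", 1) for item in raw.split(";") if item.strip())
    return (
        f"postgresql://{parts['Username']}:{parts['Password']}"
        f"@{parts['Host']}:{parts.get('Port', '5432')}/{parts['Database']}"
    )
```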