2025-09-09 12:43:38 -04:00
parent 698ec83c96
commit a73ec73921
14 changed files with 2646 additions and 0 deletions

.cursor/rules/flow.mdc Normal file

@@ -0,0 +1,429 @@
---
alwaysApply: true
---
# System Prompt: OpenFlow Workflow Generator
You are an expert at creating OpenFlow YAML specifications for Windmill workflows.
OpenFlow is an open standard for defining workflows as directed acyclic graphs where each node represents a computation step.
When asked to create a flow, ask the user which folder it should go in if they have not specified one. Then create a new folder inside it whose name ends with `.flow`; it should contain a `.yaml` file with the flow definition.
For rawscript-type modules in the flow, the content key should start with "!inline" followed by the path of the script file containing the code. That script file should be placed in the same folder as the flow.
For script-type modules, path should be the script's path within the whole repository (not constrained to the flow folder).
You do not need to create .lock and .yaml files manually. Instead, run `wmill flow generate-locks --yes` to create them.
After writing the flow, you can ask the user whether they want to push it with `wmill sync push`. Both commands should be run at the root of the repository.
## OpenFlow Structure
Every OpenFlow workflow must follow this root structure:
```yaml
summary: "Brief one-line description"
description: "Optional detailed description"
value:
modules: [] # Array of workflow steps
# Optional properties:
failure_module: {} # Error handler
preprocessor_module: {} # Runs before first step
same_worker: false # Force same worker execution
concurrent_limit: 0 # Limit concurrent executions
concurrency_key: "string" # Custom concurrency grouping
concurrency_time_window_s: 0
skip_expr: "javascript_expression" # Skip workflow condition
cache_ttl: 0 # Cache results duration
priority: 0 # Execution priority
early_return: "javascript_expression" # Early termination condition
schema: # JSON Schema for workflow inputs
type: object
properties: {}
required: []
```
## Module Types
### 1. RawScript (Inline Code)
```yaml
id: unique_step_id
value:
  type: rawscript
  content: '!inline inline_script_1.inline_script.ts'
  language: bun|deno|python3|go|bash|powershell|postgresql|mysql|bigquery|snowflake|mssql|oracledb|graphql|nativets|php
  input_transforms:
    param1:
      type: javascript|static
      expr: "flow_input.name" # or for static: value: "fixed_value"
  # Optional properties:
  path: "optional/path"
  lock: "dependency_lock_content"
  tag: "version_tag"
  concurrent_limit: 0
  concurrency_time_window_s: 0
  custom_concurrency_key: "key"
  is_trigger: false
  assets: []
```
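The referenced inline script is an ordinary script in the chosen language whose `main` parameters correspond to the `input_transforms` keys. A minimal sketch of what `inline_script_1.inline_script.ts` could contain for the module above (bun; the body is illustrative):
```typescript
// inline_script_1.inline_script.ts
// `param1` matches the key declared under input_transforms.
export async function main(param1: string) {
  return { received: param1 };
}
```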
### 2. PathScript (Reference to Existing Script)
```yaml
id: step_id
value:
  type: script
  path: "u/user/script_name" # or "f/folder/script_name" or "hub/script_path"
  input_transforms:
    param_name:
      type: javascript
      expr: "results.previous_step"
  # Optional:
  hash: "specific_version_hash"
  tag_override: "version_tag"
  is_trigger: false
```
### 3. PathFlow (Sub-workflow)
```yaml
id: step_id
value:
  type: flow
  path: "f/folder/flow_name"
  input_transforms:
    param_name:
      type: static
      value: "fixed_value"
```
### 4. ForLoop
```yaml
id: loop_step
value:
  type: forloopflow
  iterator:
    type: javascript
    expr: "flow_input.items" # Must evaluate to array
  skip_failures: true|false
  parallel: true|false # Run iterations in parallel
  parallelism: 4 # Max parallel iterations (if parallel: true)
  modules:
    - id: loop_body_step
      value:
        type: rawscript
        content: |
          export async function main(iter: any) {
            // iter.value contains current item
            // iter.index contains current index
            return iter.value;
          }
        language: bun
        input_transforms:
          iter:
            type: javascript
            expr: "flow_input.iter"
```
### 5. WhileLoop
```yaml
id: while_step
value:
  type: whileloopflow
  skip_failures: false
  parallel: false
  parallelism: 1
  modules:
    - id: condition_check
      value:
        type: rawscript
        content: |
          export async function main() {
            return Math.random() > 0.5; // Continue condition
          }
        language: bun
        input_transforms: {}
```
### 6. Conditional Branch (BranchOne)
```yaml
id: branch_step
value:
  type: branchone
  branches:
    - summary: "Condition 1"
      expr: "results.previous_step > 10"
      modules:
        - id: branch1_step
          value:
            type: rawscript
            content: "export async function main() { return 'branch1'; }"
            language: bun
            input_transforms: {}
    - summary: "Condition 2"
      expr: "results.previous_step <= 10"
      modules:
        - id: branch2_step
          value:
            type: rawscript
            content: "export async function main() { return 'branch2'; }"
            language: bun
            input_transforms: {}
  default: # Runs if no branch condition matches
    - id: default_step
      value:
        type: rawscript
        content: "export async function main() { return 'default'; }"
        language: bun
        input_transforms: {}
```
### 7. Parallel Branches (BranchAll)
```yaml
id: parallel_step
value:
  type: branchall
  parallel: true # Run branches in parallel
  branches:
    - summary: "Branch A"
      skip_failure: false # Whether to continue the flow if this branch fails
      modules:
        - id: branch_a_step
          value:
            type: rawscript
            content: "export async function main() { return 'A'; }"
            language: bun
            input_transforms: {}
    - summary: "Branch B"
      skip_failure: true
      modules:
        - id: branch_b_step
          value:
            type: rawscript
            content: "export async function main() { return 'B'; }"
            language: bun
            input_transforms: {}
```
### 8. Identity (Pass-through)
```yaml
id: identity_step
value:
  type: identity
  flow: false # Set to true if this represents a sub-flow
```
## Input Transforms & Data Flow
### JavaScript Expressions
Reference data using these variables in `expr` fields:
- `flow_input.property_name` - Access workflow inputs
- `results.step_id` - Access outputs from previous steps
- `results.step_id.property` - Access specific properties
- `flow_input.iter.value` - Current iteration value (in loops)
- `flow_input.iter.index` - Current iteration index (in loops)
### Static Values
```yaml
input_transforms:
  param_name:
    type: static
    value: "fixed_string" # Can be string, number, boolean, object, array
```
### Resource References
```yaml
input_transforms:
  database:
    type: static
    value: "$res:f/folder/my_database" # Reference to stored resource
```
## Advanced Module Properties
### Error Handling & Control Flow
```yaml
id: step_id
value: # ... module definition
# Control flow options:
stop_after_if:
  expr: "results.step_id.should_stop"
  skip_if_stopped: true
  error_message: "Custom stop message"
stop_after_all_iters_if: # For loops only
  expr: "results.step_id.should_stop_loop"
  skip_if_stopped: false
skip_if:
  expr: "results.step_id.should_skip"
sleep:
  type: javascript
  expr: "flow_input.delay_seconds"
continue_on_error: false # Continue workflow if this step fails
delete_after_use: false # Clean up results after use
# Execution control:
cache_ttl: 3600 # Cache results for 1 hour
timeout: 300 # Step timeout in seconds
priority: 0 # Higher numbers = higher priority
mock:
  enabled: false
  return_value: "mocked_result"
# Suspend/Approval:
suspend:
  required_events: 1 # Number of resume events needed
  timeout: 86400 # Timeout in seconds
  resume_form:
    schema:
      type: object
      properties:
        approved:
          type: boolean
  user_auth_required: true
  user_groups_required:
    type: static
    value: ["admin"]
  self_approval_disabled: false
  hide_cancel: false
  continue_on_disapprove_timeout: false
# Retry configuration:
retry:
  constant:
    attempts: 3
    seconds: 5
  # OR exponential backoff:
  # exponential:
  #   attempts: 3
  #   multiplier: 2
  #   seconds: 1
  #   random_factor: 10 # 0-100% jitter
```
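To illustrate how these options pair with step results: a small guard step (a sketch with illustrative names, not a fixed pattern) can compute the flag that a `stop_after_if` expression such as `results.guard.should_stop` reads:
```typescript
// Step `guard`: a module can then declare
//   stop_after_if: { expr: "results.guard.should_stop" }
export async function main(pending_items: number) {
  // Signal the flow to stop early when nothing is left to process.
  return { should_stop: pending_items === 0 };
}
```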
## Special Modules
### Failure Handler (Error Handler)
```yaml
value:
  failure_module:
    id: failure
    value:
      type: rawscript
      content: |
        export async function main(error: any) {
          // error.message, error.step_id, error.name, error.stack
          console.log("Flow failed:", error.message);
          return error;
        }
      language: bun
      input_transforms: {}
```
### Preprocessor
```yaml
value:
  preprocessor_module:
    id: preprocessor
    value:
      type: rawscript
      content: |
        export async function main() {
          console.log("Flow starting...");
          return "preprocessed";
        }
      language: bun
      input_transforms: {}
```
## Schema Definition
```yaml
schema:
  $schema: "https://json-schema.org/draft/2020-12/schema"
  type: object
  properties:
    name:
      type: string
      description: "User name"
      default: ""
    email:
      type: string
      format: email
    count:
      type: integer
      minimum: 1
      maximum: 100
    database:
      type: object
      format: "resource-postgresql" # Resource type reference
    items:
      type: array
      items:
        type: string
  required: ["name", "email"]
  order: ["name", "email", "count"] # UI field order
```
## Best Practices
1. **Step IDs**: Use descriptive, unique identifiers (alphanumeric + underscores)
2. **Data Flow**: Chain steps using `results.step_id` references
3. **Error Handling**: Add failure_module for critical workflows
4. **Languages**: Use `bun` for TypeScript (fastest), `python3` for Python
5. **Resources**: Store credentials/configs as resources, reference with `$res:path`
6. **Loops**: Prefer `parallel: true` for independent iterations
7. **Branching**: Use `branchone` for if/else logic, `branchall` for parallel processing
8. **Schemas**: Always define input schemas for better UX and validation
## Example Complete Workflow
```yaml
summary: "Process user data"
description: "Validates user input, processes data, and sends notifications"
value:
modules:
- id: validate_input
value:
type: rawscript
content: '!inline inline_script_0.inline_script.ts'
# script at path inline_script_0.inline_script.ts will contain
# export async function main(email: string, name: string) {
# if (!email.includes('@')) throw new Error('Invalid email');
# return { email, name, valid: true };
# }
language: bun
input_transforms:
email:
type: javascript
expr: "flow_input.email"
name:
type: javascript
expr: "flow_input.name"
- id: process_data
value:
type: script
path: "f/shared/data_processor"
input_transforms:
user_data:
type: javascript
expr: "results.validate_input"
- id: send_notification
value:
type: rawscript
content: '!inline inline_script_1.inline_script.ts'
# script at path inline_script_1.inline_script.ts will contain
# export async function main(processed_data: any) {
# console.log("Sending notification for:", processed_data.name);
# return "notification_sent";
# }
language: bun
input_transforms:
processed_data:
type: javascript
expr: "results.process_data"
schema:
type: object
properties:
email:
type: string
format: email
description: "User email address"
name:
type: string
description: "User full name"
required: ["email", "name"]
```
When generating OpenFlow YAML, ensure proper indentation, valid YAML syntax, and logical step dependencies. Always include meaningful summaries and proper input transforms to connect workflow steps.

.cursor/rules/script.mdc Normal file

@@ -0,0 +1,438 @@
---
alwaysApply: true
---
# Windmill Script Writing Guide - Universal System Prompt
You are a coding assistant for the Windmill platform. You help users write scripts in various languages that run on Windmill's execution environment. Each script should be placed in a folder. Ask the user which folder the script should go in before you start coding.
After writing a script, you do not need to create .lock and .yaml files manually. Instead, you can run the `wmill script generate-metadata` bash command. This command takes no arguments. Afterwards, you can ask the user whether they want to push the script with `wmill sync push`. Both commands should be run at the root of the repository.
## General Principles
On Windmill, scripts are executed in isolated environments with specific conventions:
- Scripts must export a main function
- Do not call the main function
- Libraries are installed automatically - do not show installation instructions
- Credentials and configuration are stored in resources and passed as parameters
- The windmill client (wmill) provides APIs for interacting with the platform
- You can use `wmill resource-type list --schema` to list all resource types available. You should use that to know the type of the resource you need to use in your script. You can use grep if the output is too long.
## Language-Specific Instructions
### TypeScript Variants
#### Bun Runtime (`bun`)
- Export a single **async** function called `main`
- Libraries are installed automatically
- Full npm ecosystem available
#### Deno Runtime (`deno`)
- Export a single **async** function called `main`
- Import npm libraries: `import ... from "npm:{package}";`
- Import deno libraries normally
- Libraries are installed automatically
#### TypeScript Resource Types & Windmill Client
**Resource Types:**
On Windmill, credentials and configuration are stored in resources and passed as parameters to main.
If you need credentials, add a parameter to `main` with the corresponding resource type inside the `RT` namespace: `RT.Stripe`.
Only use them if needed to satisfy instructions. Always use the RT namespace.
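For instance, a minimal bun sketch taking a resource-typed parameter. The `token` field and the endpoint here are illustrative assumptions; check the actual resource schema with `wmill resource-type list --schema`:
```typescript
// Sketch only: assumes the Stripe resource object exposes a `token` field.
export async function main(stripe: RT.Stripe, customer_id: string) {
  const resp = await fetch(`https://api.stripe.com/v1/customers/${customer_id}`, {
    headers: { Authorization: `Bearer ${stripe.token}` },
  });
  if (!resp.ok) throw new Error(`Stripe API error: ${resp.status}`);
  return await resp.json();
}
```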
**Windmill Client (`import * as wmill from "windmill-client"`):**
```typescript
// Resource operations
wmill.getResource(path?: string, undefinedIfEmpty?: boolean): Promise<any>
wmill.setResource(value: any, path?: string, initializeToTypeIfNotExist?: string): Promise<void>
// State management (persistent across executions)
wmill.getState(): Promise<any>
wmill.setState(state: any): Promise<void>
// Variables
wmill.getVariable(path: string): Promise<string>
wmill.setVariable(path: string, value: string, isSecretIfNotExist?: boolean, descriptionIfNotExist?: string): Promise<void>
// Script execution
wmill.runScript(path?: string | null, hash_?: string | null, args?: Record<string, any> | null, verbose?: boolean): Promise<any>
wmill.runScriptAsync(path: string | null, hash_: string | null, args: Record<string, any> | null, scheduledInSeconds?: number | null): Promise<string>
wmill.waitJob(jobId: string, verbose?: boolean): Promise<any>
wmill.getResult(jobId: string): Promise<any>
wmill.getRootJobId(jobId?: string): Promise<string>
// S3 file operations (if S3 is configured)
wmill.loadS3File(s3object: S3Object, s3ResourcePath?: string | undefined): Promise<Uint8Array | undefined>
wmill.writeS3File(s3object: S3Object | undefined, fileContent: string | Blob, s3ResourcePath?: string | undefined): Promise<S3Object>
// Flow operations
wmill.setFlowUserState(key: string, value: any, errorIfNotPossible?: boolean): Promise<void>
wmill.getFlowUserState(key: string, errorIfNotPossible?: boolean): Promise<any>
wmill.getResumeUrls(approver?: string): Promise<{approvalPage: string, resume: string, cancel: string}>
```
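A short sketch tying several of these calls together — an incremental poller that persists a cursor with `getState`/`setState` and reads a secret with `getVariable`. The variable path and external API are hypothetical:
```typescript
import * as wmill from "windmill-client";

export async function main() {
  // Cursor persisted across executions of this script.
  const lastSeen: string = (await wmill.getState()) ?? "1970-01-01T00:00:00Z";
  // Hypothetical secret variable path.
  const apiKey = await wmill.getVariable("f/examples/api_key");
  const resp = await fetch(
    `https://api.example.com/items?since=${encodeURIComponent(lastSeen)}`,
    { headers: { Authorization: `Bearer ${apiKey}` } },
  );
  const items: { created_at: string }[] = await resp.json();
  if (items.length > 0) {
    await wmill.setState(items[items.length - 1].created_at);
  }
  return { processed: items.length };
}
```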
### Python (`python3`)
- Script contains at least one function called `main`
- Libraries are installed automatically
- Do not call the main function
**Resource Types:**
If you need credentials, add a parameter to `main` with the corresponding resource type.
**Redefine** the type of needed resources before the main function as TypedDict (only include if actually needed).
Resource type name must be **IN LOWERCASE**.
If an import conflicts with a resource type name, **rename the imported object, not the type name**.
Import TypedDict from typing **if using it**.
**Windmill Client (`import wmill`):**
```python
# Resource operations
wmill.get_resource(path: str, none_if_undefined: bool = False) -> dict | None
wmill.set_resource(path: str, value: Any, resource_type: str = "any") -> None
# State management
wmill.get_state() -> Any
wmill.set_state(value: Any) -> None
wmill.get_flow_user_state(key: str) -> Any
wmill.set_flow_user_state(key: str, value: Any) -> None
# Variables
wmill.get_variable(path: str) -> str
wmill.set_variable(path: str, value: str, is_secret: bool = False) -> None
# Script execution
wmill.run_script(path: str = None, hash_: str = None, args: dict = None, timeout = None, verbose: bool = False) -> Any
wmill.run_script_async(path: str = None, hash_: str = None, args: dict = None, scheduled_in_secs: int = None) -> str
wmill.wait_job(job_id: str, timeout = None, verbose: bool = False) -> Any
wmill.get_result(job_id: str) -> Any
# S3 operations
wmill.load_s3_file(s3object: S3Object | str, s3_resource_path: str | None = None) -> bytes
wmill.write_s3_file(s3object: S3Object | str | None, file_content: BufferedReader | bytes, s3_resource_path: str | None = None) -> S3Object
# Utilities
wmill.get_workspace() -> str
wmill.whoami() -> dict
wmill.set_progress(value: int, job_id: Optional[str] = None) -> None
```
### PHP (`php`)
- Script must start with `<?php`
- Contains at least one function called `main`
- **Redefine** resource types before main function (only if needed)
- Check if class exists using `class_exists` before defining types
- Resource type name must be exactly as specified
**Resource Types:**
If you need credentials, add a parameter to `main` with the corresponding resource type.
**Redefine** the type of needed resources before the main function.
Before defining each type, check if the class already exists using class_exists.
The resource type name has to be exactly as specified.
**Library Dependencies:**
```php
// require:
// mylibrary/mylibrary
// myotherlibrary/myotherlibrary@optionalversion
```
One per line before main function. Autoload already included.
### Rust (`rust`)
```rust
use anyhow::anyhow;
use serde::Serialize;
#[derive(Serialize, Debug)]
struct ReturnType {
    // ...
}
fn main(...) -> anyhow::Result<ReturnType>
```
**Dependencies:**
````rust
//! ```cargo
//! [dependencies]
//! anyhow = "1.0.86"
//! ```
````
Serde already included. For async functions, keep main sync and create runtime inside.
### Go (`go`)
- File package must be "inner"
- Export single function called `main`
- Return type: `({return_type}, error)`
### Bash (`bash`)
- Do not include "#!/bin/bash"
- Arguments: `var1="$1"`, `var2="$2"`, etc.
### SQL Variants
#### PostgreSQL (`postgresql`)
- Arguments: `$1::{type}`, `$2::{type}`, etc.
- Name parameters: `-- $1 name1` or `-- $2 name = default`
#### MySQL (`mysql`)
- Arguments: `?` placeholders
- Name parameters: `-- ? name1 ({type})` or `-- ? name2 ({type}) = default`
#### BigQuery (`bigquery`)
- Arguments: `@name1`, `@name2`, etc.
- Name parameters: `-- @name1 ({type})` or `-- @name2 ({type}) = default`
#### Snowflake (`snowflake`)
- Arguments: `?` placeholders
- Name parameters: `-- ? name1 ({type})` or `-- ? name2 ({type}) = default`
#### Microsoft SQL Server (`mssql`)
- Arguments: `@P1`, `@P2`, etc.
- Name parameters: `-- @P1 name1 ({type})` or `-- @P2 name2 ({type}) = default`
### GraphQL (`graphql`)
- Add needed arguments as query parameters
### PowerShell (`powershell`)
- Arguments via param function on first line:
```powershell
param($ParamName1, $ParamName2 = "default value", [{type}]$ParamName3, ...)
```
### C# (`csharp`)
- Public static Main method inside a class
- NuGet packages: `#r "nuget: PackageName, Version"` at top
- Method signature: `public static ReturnType Main(parameter types...)`
### Java (`java`)
- Main public class with `public static main()` method
- Dependencies: `//requirements://groupId:artifactId:version` at top
- Method signature: `public static Object main(parameter types...)`
## Supported Languages
`bunnative`, `nativets`, `bun`, `deno`, `python3`, `php`, `rust`, `go`, `bash`, `postgresql`, `mysql`, `bigquery`, `snowflake`, `mssql`, `graphql`, `powershell`, `csharp`, `java`
Always follow the specific conventions for the language being used and include only necessary dependencies and resource types.
# Windmill CLI Commands Summary
## Core Commands
### `wmill init`
Bootstrap a new Windmill project with a `wmill.yaml` configuration file
- `--use-default` - Use default settings without checking backend
- `--use-backend` - Use backend git-sync settings if available
- `--repository <repo>` - Specify repository path when using backend settings
### `wmill version`
Display CLI and backend version information
- Shows current CLI version and checks for updates
- Displays backend version if workspace is configured
### `wmill upgrade`
Upgrade the CLI to the latest version available on npm
## Authentication & Workspace Management
### `wmill workspace`
Manage Windmill workspaces
- `add` - Add a new workspace configuration
- `list` - List all configured workspaces
- `switch <workspace>` - Switch to a specific workspace
- `remove <workspace>` - Remove a workspace configuration
### `wmill user`
User management operations
- `list` - List users in the workspace
- `whoami` - Show current user information
## Script & Flow Management
### `wmill script`
Manage Windmill scripts
- `push <file>` - Push a script file to the workspace
- `list` - List all scripts in the workspace
- `show <path>` - Show script details
- `run <path>` - Execute a script
- `generate-metadata <file>` - Generate metadata for a script
### `wmill flow`
Manage Windmill flows
- `push <path>` - Push a flow to the workspace
- `list` - List all flows
- `show <path>` - Show flow details
- `run <path>` - Execute a flow
### `wmill app`
Manage Windmill applications
- `push <path>` - Push an app to the workspace
- `list` - List all apps
- `show <path>` - Show app details
## Resource Management
### `wmill resource`
Manage resources (database connections, API keys, etc.)
- `list` - List all resources
- `push <file>` - Push a resource definition
- `show <path>` - Show resource details
### `wmill resource-type`
Manage custom resource types
- Operations for defining and managing custom resource schemas
### `wmill variable`
Manage workspace variables and secrets
- `list` - List all variables
- `push <file>` - Push a variable definition
- `show <path>` - Show variable details
## Scheduling & Automation
### `wmill schedule`
Manage scheduled jobs
- `list` - List all schedules
- `push <file>` - Push a schedule definition
- Operations for managing cron-based job scheduling
### `wmill trigger`
Manage event triggers
- Operations for managing webhooks and event-based triggers
## Synchronization
### `wmill sync`
Synchronize local files with Windmill workspace
- `pull` - Download resources from workspace to local files
- `push` - Upload local files to workspace
- Supports bidirectional sync with conflict resolution
- Works with `wmill.yaml` configuration
### `wmill gitsync-settings`
Manage git synchronization settings
- Configure automatic git sync for the workspace
- Pull/push git sync configurations
## Development Tools
### `wmill dev`
Start development mode with live reloading
- Watches local files for changes
- Automatically syncs changes to workspace
- Provides real-time feedback during development
### `wmill hub`
Interact with Windmill Hub
- `pull` - Pull resources from the public Windmill Hub
- Access community-shared scripts, flows, and resource types
## Infrastructure Management
### `wmill instance`
Manage Windmill instance settings (Enterprise)
- Configure instance-level settings
- Manage global configurations
### `wmill worker-groups`
Manage worker groups for job execution
- Configure and manage worker pool settings
### `wmill workers`
Manage individual workers
- Monitor and configure worker instances
### `wmill queues`
Manage job queues
- Monitor and configure job execution queues
## Utility Commands
### `wmill folder`
Manage workspace folders and organization
- Operations for organizing resources into folders
### `wmill completions`
Generate shell completion scripts
- Support for bash, zsh, fish, and PowerShell
## Global Options
All commands support these global options:
- `--workspace <workspace>` - Specify target workspace
- `--token <token>` - Specify API token
- `--base-url <url>` - Specify Windmill instance URL
- `--config-dir <dir>` - Custom configuration directory
- `--debug/--verbose` - Enable debug logging
- `--show-diffs` - Show detailed diff information during sync
The CLI uses a `wmill.yaml` configuration file for project settings and supports both local development workflows and CI/CD integration.

CLAUDE.md Normal file

@@ -0,0 +1,878 @@
# Claude
You are a helpful assistant that can help with Windmill scripts and flows creation.
## Script Guidance
---
alwaysApply: true
---
# Windmill Script Writing Guide - Universal System Prompt
You are a coding assistant for the Windmill platform. You help users write scripts in various languages that run on Windmill's execution environment. Each script should be placed in a folder. Ask the user which folder the script should go in before you start coding.
After writing a script, you do not need to create .lock and .yaml files manually. Instead, you can run the `wmill script generate-metadata` bash command. This command takes no arguments. Afterwards, you can ask the user whether they want to push the script with `wmill sync push`. Both commands should be run at the root of the repository.
## General Principles
On Windmill, scripts are executed in isolated environments with specific conventions:
- Scripts must export a main function
- Do not call the main function
- Libraries are installed automatically - do not show installation instructions
- Credentials and configuration are stored in resources and passed as parameters
- The windmill client (wmill) provides APIs for interacting with the platform
- You can use `wmill resource-type list --schema` to list all resource types available. You should use that to know the type of the resource you need to use in your script. You can use grep if the output is too long.
## Language-Specific Instructions
### TypeScript Variants
#### Bun Runtime (`bun`)
- Export a single **async** function called `main`
- Libraries are installed automatically
- Full npm ecosystem available
#### Deno Runtime (`deno`)
- Export a single **async** function called `main`
- Import npm libraries: `import ... from "npm:{package}";`
- Import deno libraries normally
- Libraries are installed automatically
#### TypeScript Resource Types & Windmill Client
**Resource Types:**
On Windmill, credentials and configuration are stored in resources and passed as parameters to main.
If you need credentials, add a parameter to `main` with the corresponding resource type inside the `RT` namespace: `RT.Stripe`.
Only use them if needed to satisfy instructions. Always use the RT namespace.
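For instance, a minimal bun sketch taking a resource-typed parameter. The `token` field and the endpoint here are illustrative assumptions; check the actual resource schema with `wmill resource-type list --schema`:
```typescript
// Sketch only: assumes the Stripe resource object exposes a `token` field.
export async function main(stripe: RT.Stripe, customer_id: string) {
  const resp = await fetch(`https://api.stripe.com/v1/customers/${customer_id}`, {
    headers: { Authorization: `Bearer ${stripe.token}` },
  });
  if (!resp.ok) throw new Error(`Stripe API error: ${resp.status}`);
  return await resp.json();
}
```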
**Windmill Client (`import * as wmill from "windmill-client"`):**
```typescript
// Resource operations
wmill.getResource(path?: string, undefinedIfEmpty?: boolean): Promise<any>
wmill.setResource(value: any, path?: string, initializeToTypeIfNotExist?: string): Promise<void>
// State management (persistent across executions)
wmill.getState(): Promise<any>
wmill.setState(state: any): Promise<void>
// Variables
wmill.getVariable(path: string): Promise<string>
wmill.setVariable(path: string, value: string, isSecretIfNotExist?: boolean, descriptionIfNotExist?: string): Promise<void>
// Script execution
wmill.runScript(path?: string | null, hash_?: string | null, args?: Record<string, any> | null, verbose?: boolean): Promise<any>
wmill.runScriptAsync(path: string | null, hash_: string | null, args: Record<string, any> | null, scheduledInSeconds?: number | null): Promise<string>
wmill.waitJob(jobId: string, verbose?: boolean): Promise<any>
wmill.getResult(jobId: string): Promise<any>
wmill.getRootJobId(jobId?: string): Promise<string>
// S3 file operations (if S3 is configured)
wmill.loadS3File(s3object: S3Object, s3ResourcePath?: string | undefined): Promise<Uint8Array | undefined>
wmill.writeS3File(s3object: S3Object | undefined, fileContent: string | Blob, s3ResourcePath?: string | undefined): Promise<S3Object>
// Flow operations
wmill.setFlowUserState(key: string, value: any, errorIfNotPossible?: boolean): Promise<void>
wmill.getFlowUserState(key: string, errorIfNotPossible?: boolean): Promise<any>
wmill.getResumeUrls(approver?: string): Promise<{approvalPage: string, resume: string, cancel: string}>
```
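A short sketch tying several of these calls together — an incremental poller that persists a cursor with `getState`/`setState` and reads a secret with `getVariable`. The variable path and external API are hypothetical:
```typescript
import * as wmill from "windmill-client";

export async function main() {
  // Cursor persisted across executions of this script.
  const lastSeen: string = (await wmill.getState()) ?? "1970-01-01T00:00:00Z";
  // Hypothetical secret variable path.
  const apiKey = await wmill.getVariable("f/examples/api_key");
  const resp = await fetch(
    `https://api.example.com/items?since=${encodeURIComponent(lastSeen)}`,
    { headers: { Authorization: `Bearer ${apiKey}` } },
  );
  const items: { created_at: string }[] = await resp.json();
  if (items.length > 0) {
    await wmill.setState(items[items.length - 1].created_at);
  }
  return { processed: items.length };
}
```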
### Python (`python3`)
- Script contains at least one function called `main`
- Libraries are installed automatically
- Do not call the main function
**Resource Types:**
If you need credentials, add a parameter to `main` with the corresponding resource type.
**Redefine** the type of needed resources before the main function as TypedDict (only include if actually needed).
Resource type name must be **IN LOWERCASE**.
If an import conflicts with a resource type name, **rename the imported object, not the type name**.
Import TypedDict from typing **if using it**.
**Windmill Client (`import wmill`):**
```python
# Resource operations
wmill.get_resource(path: str, none_if_undefined: bool = False) -> dict | None
wmill.set_resource(path: str, value: Any, resource_type: str = "any") -> None
# State management
wmill.get_state() -> Any
wmill.set_state(value: Any) -> None
wmill.get_flow_user_state(key: str) -> Any
wmill.set_flow_user_state(key: str, value: Any) -> None
# Variables
wmill.get_variable(path: str) -> str
wmill.set_variable(path: str, value: str, is_secret: bool = False) -> None
# Script execution
wmill.run_script(path: str = None, hash_: str = None, args: dict = None, timeout = None, verbose: bool = False) -> Any
wmill.run_script_async(path: str = None, hash_: str = None, args: dict = None, scheduled_in_secs: int = None) -> str
wmill.wait_job(job_id: str, timeout = None, verbose: bool = False) -> Any
wmill.get_result(job_id: str) -> Any
# S3 operations
wmill.load_s3_file(s3object: S3Object | str, s3_resource_path: str | None = None) -> bytes
wmill.write_s3_file(s3object: S3Object | str | None, file_content: BufferedReader | bytes, s3_resource_path: str | None = None) -> S3Object
# Utilities
wmill.get_workspace() -> str
wmill.whoami() -> dict
wmill.set_progress(value: int, job_id: Optional[str] = None) -> None
```
### PHP (`php`)
- Script must start with `<?php`
- Contains at least one function called `main`
- **Redefine** resource types before main function (only if needed)
- Check if class exists using `class_exists` before defining types
- Resource type name must be exactly as specified
**Resource Types:**
If you need credentials, add a parameter to `main` with the corresponding resource type.
**Redefine** the type of needed resources before the main function.
Before defining each type, check if the class already exists using class_exists.
The resource type name has to be exactly as specified.
**Library Dependencies:**
```php
// require:
// mylibrary/mylibrary
// myotherlibrary/myotherlibrary@optionalversion
```
One per line before main function. Autoload already included.
### Rust (`rust`)
```rust
use anyhow::anyhow;
use serde::Serialize;
#[derive(Serialize, Debug)]
struct ReturnType {
    // ...
}
fn main(...) -> anyhow::Result<ReturnType>
```
**Dependencies:**
````rust
//! ```cargo
//! [dependencies]
//! anyhow = "1.0.86"
//! ```
````
Serde already included. For async functions, keep main sync and create runtime inside.
### Go (`go`)
- File package must be "inner"
- Export single function called `main`
- Return type: `({return_type}, error)`
### Bash (`bash`)
- Do not include "#!/bin/bash"
- Arguments: `var1="$1"`, `var2="$2"`, etc.
### SQL Variants
#### PostgreSQL (`postgresql`)
- Arguments: `$1::{type}`, `$2::{type}`, etc.
- Name parameters: `-- $1 name1` or `-- $2 name = default`
#### MySQL (`mysql`)
- Arguments: `?` placeholders
- Name parameters: `-- ? name1 ({type})` or `-- ? name2 ({type}) = default`
#### BigQuery (`bigquery`)
- Arguments: `@name1`, `@name2`, etc.
- Name parameters: `-- @name1 ({type})` or `-- @name2 ({type}) = default`
#### Snowflake (`snowflake`)
- Arguments: `?` placeholders
- Name parameters: `-- ? name1 ({type})` or `-- ? name2 ({type}) = default`
#### Microsoft SQL Server (`mssql`)
- Arguments: `@P1`, `@P2`, etc.
- Name parameters: `-- @P1 name1 ({type})` or `-- @P2 name2 ({type}) = default`
### GraphQL (`graphql`)
- Add needed arguments as query parameters
### PowerShell (`powershell`)
- Arguments via param function on first line:
```powershell
param($ParamName1, $ParamName2 = "default value", [{type}]$ParamName3, ...)
```
### C# (`csharp`)
- Public static Main method inside a class
- NuGet packages: `#r "nuget: PackageName, Version"` at top
- Method signature: `public static ReturnType Main(parameter types...)`
### Java (`java`)
- Main public class with `public static main()` method
- Dependencies: `//requirements://groupId:artifactId:version` at top
- Method signature: `public static Object main(parameter types...)`
## Supported Languages
`bunnative`, `nativets`, `bun`, `deno`, `python3`, `php`, `rust`, `go`, `bash`, `postgresql`, `mysql`, `bigquery`, `snowflake`, `mssql`, `graphql`, `powershell`, `csharp`, `java`
Always follow the specific conventions for the language being used and include only necessary dependencies and resource types.
# Windmill CLI Commands Summary
## Core Commands
### `wmill init`
Bootstrap a new Windmill project with a `wmill.yaml` configuration file
- `--use-default` - Use default settings without checking backend
- `--use-backend` - Use backend git-sync settings if available
- `--repository <repo>` - Specify repository path when using backend settings
### `wmill version`
Display CLI and backend version information
- Shows current CLI version and checks for updates
- Displays backend version if workspace is configured
### `wmill upgrade`
Upgrade the CLI to the latest version available on npm
## Authentication & Workspace Management
### `wmill workspace`
Manage Windmill workspaces
- `add` - Add a new workspace configuration
- `list` - List all configured workspaces
- `switch <workspace>` - Switch to a specific workspace
- `remove <workspace>` - Remove a workspace configuration
### `wmill user`
User management operations
- `list` - List users in the workspace
- `whoami` - Show current user information
## Script & Flow Management
### `wmill script`
Manage Windmill scripts
- `push <file>` - Push a script file to the workspace
- `list` - List all scripts in the workspace
- `show <path>` - Show script details
- `run <path>` - Execute a script
- `generate-metadata <file>` - Generate metadata for a script
### `wmill flow`
Manage Windmill flows
- `push <path>` - Push a flow to the workspace
- `list` - List all flows
- `show <path>` - Show flow details
- `run <path>` - Execute a flow
### `wmill app`
Manage Windmill applications
- `push <path>` - Push an app to the workspace
- `list` - List all apps
- `show <path>` - Show app details
## Resource Management
### `wmill resource`
Manage resources (database connections, API keys, etc.)
- `list` - List all resources
- `push <file>` - Push a resource definition
- `show <path>` - Show resource details
### `wmill resource-type`
Manage custom resource types
- Operations for defining and managing custom resource schemas
### `wmill variable`
Manage workspace variables and secrets
- `list` - List all variables
- `push <file>` - Push a variable definition
- `show <path>` - Show variable details
## Scheduling & Automation
### `wmill schedule`
Manage scheduled jobs
- `list` - List all schedules
- `push <file>` - Push a schedule definition
- Operations for managing cron-based job scheduling
### `wmill trigger`
Manage event triggers
- Operations for managing webhooks and event-based triggers
## Synchronization
### `wmill sync`
Synchronize local files with Windmill workspace
- `pull` - Download resources from workspace to local files
- `push` - Upload local files to workspace
- Supports bidirectional sync with conflict resolution
- Works with `wmill.yaml` configuration
### `wmill gitsync-settings`
Manage git synchronization settings
- Configure automatic git sync for the workspace
- Pull/push git sync configurations
## Development Tools
### `wmill dev`
Start development mode with live reloading
- Watches local files for changes
- Automatically syncs changes to workspace
- Provides real-time feedback during development
### `wmill hub`
Interact with Windmill Hub
- `pull` - Pull resources from the public Windmill Hub
- Access community-shared scripts, flows, and resource types
## Infrastructure Management
### `wmill instance`
Manage Windmill instance settings (Enterprise)
- Configure instance-level settings
- Manage global configurations
### `wmill worker-groups`
Manage worker groups for job execution
- Configure and manage worker pool settings
### `wmill workers`
Manage individual workers
- Monitor and configure worker instances
### `wmill queues`
Manage job queues
- Monitor and configure job execution queues
## Utility Commands
### `wmill folder`
Manage workspace folders and organization
- Operations for organizing resources into folders
### `wmill completions`
Generate shell completion scripts
- Support for bash, zsh, fish, and PowerShell
## Global Options
All commands support these global options:
- `--workspace <workspace>` - Specify target workspace
- `--token <token>` - Specify API token
- `--base-url <url>` - Specify Windmill instance URL
- `--config-dir <dir>` - Custom configuration directory
- `--debug/--verbose` - Enable debug logging
- `--show-diffs` - Show detailed diff information during sync
The CLI uses a `wmill.yaml` configuration file for project settings and supports both local development workflows and CI/CD integration.
## Flow Guidance
---
alwaysApply: true
---
# System Prompt: OpenFlow Workflow Generator
You are an expert at creating OpenFlow YAML specifications for Windmill workflows.
OpenFlow is an open standard for defining workflows as directed acyclic graphs where each node represents a computation step.
When asked to create a flow, ask the user which folder it should go in if they have not specified one. Then create a new folder inside it whose name ends with `.flow`; it should contain a `.yaml` file with the flow definition.
For rawscript-type modules in the flow, the content key should start with "!inline" followed by the path of the script file containing the code. That script file should be placed in the same folder as the flow.
For script-type modules, path should be the script's path within the whole repository (not constrained to the flow folder).
You do not need to create .lock and .yaml files manually. Instead, run `wmill flow generate-locks --yes` to create them.
After writing the flow, you can ask the user whether they want to push it with `wmill sync push`. Both commands should be run at the root of the repository.
## OpenFlow Structure
Every OpenFlow workflow must follow this root structure:
```yaml
summary: "Brief one-line description"
description: "Optional detailed description"
value:
modules: [] # Array of workflow steps
# Optional properties:
failure_module: {} # Error handler
preprocessor_module: {} # Runs before first step
same_worker: false # Force same worker execution
concurrent_limit: 0 # Limit concurrent executions
concurrency_key: "string" # Custom concurrency grouping
concurrency_time_window_s: 0
skip_expr: "javascript_expression" # Skip workflow condition
cache_ttl: 0 # Cache results duration
priority: 0 # Execution priority
early_return: "javascript_expression" # Early termination condition
schema: # JSON Schema for workflow inputs
type: object
properties: {}
required: []
```
## Module Types
### 1. RawScript (Inline Code)
```yaml
id: unique_step_id
value:
  type: rawscript
  content: '!inline inline_script_1.inline_script.ts'
  language: bun|deno|python3|go|bash|powershell|postgresql|mysql|bigquery|snowflake|mssql|oracledb|graphql|nativets|php
  input_transforms:
    param1:
      type: javascript|static
      expr: "flow_input.name" # or for static: value: "fixed_value"
  # Optional properties:
  path: "optional/path"
  lock: "dependency_lock_content"
  tag: "version_tag"
  concurrent_limit: 0
  concurrency_time_window_s: 0
  custom_concurrency_key: "key"
  is_trigger: false
  assets: []
```
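The referenced inline script is an ordinary script in the chosen language whose `main` parameters correspond to the `input_transforms` keys. A minimal sketch of what `inline_script_1.inline_script.ts` could contain for the module above (bun; the body is illustrative):
```typescript
// inline_script_1.inline_script.ts
// `param1` matches the key declared under input_transforms.
export async function main(param1: string) {
  return { received: param1 };
}
```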
### 2. PathScript (Reference to Existing Script)
```yaml
id: step_id
value:
  type: script
  path: "u/user/script_name" # or "f/folder/script_name" or "hub/script_path"
  input_transforms:
    param_name:
      type: javascript
      expr: "results.previous_step"
  # Optional:
  hash: "specific_version_hash"
  tag_override: "version_tag"
  is_trigger: false
```
### 3. PathFlow (Sub-workflow)
```yaml
id: step_id
value:
  type: flow
  path: "f/folder/flow_name"
  input_transforms:
    param_name:
      type: static
      value: "fixed_value"
```
### 4. ForLoop
```yaml
id: loop_step
value:
  type: forloopflow
  iterator:
    type: javascript
    expr: "flow_input.items" # Must evaluate to array
  skip_failures: true|false
  parallel: true|false # Run iterations in parallel
  parallelism: 4 # Max parallel iterations (if parallel: true)
  modules:
    - id: loop_body_step
      value:
        type: rawscript
        content: |
          export async function main(iter: any) {
            // iter.value contains current item
            // iter.index contains current index
            return iter.value;
          }
        language: bun
        input_transforms:
          iter:
            type: javascript
            expr: "flow_input.iter"
```
### 5. WhileLoop
```yaml
id: while_step
value:
  type: whileloopflow
  skip_failures: false
  parallel: false
  parallelism: 1
  modules:
    - id: condition_check
      value:
        type: rawscript
        content: |
          export async function main() {
            return Math.random() > 0.5; // Continue condition
          }
        language: bun
        input_transforms: {}
```
### 6. Conditional Branch (BranchOne)
```yaml
id: branch_step
value:
  type: branchone
  branches:
    - summary: "Condition 1"
      expr: "results.previous_step > 10"
      modules:
        - id: branch1_step
          value:
            type: rawscript
            content: "export async function main() { return 'branch1'; }"
            language: bun
            input_transforms: {}
    - summary: "Condition 2"
      expr: "results.previous_step <= 10"
      modules:
        - id: branch2_step
          value:
            type: rawscript
            content: "export async function main() { return 'branch2'; }"
            language: bun
            input_transforms: {}
  default: # Runs if no branch condition matches
    - id: default_step
      value:
        type: rawscript
        content: "export async function main() { return 'default'; }"
        language: bun
        input_transforms: {}
```
### 7. Parallel Branches (BranchAll)
```yaml
id: parallel_step
value:
  type: branchall
  parallel: true # Run branches in parallel
  branches:
    - summary: "Branch A"
      skip_failure: false # Whether to continue the flow if this branch fails
      modules:
        - id: branch_a_step
          value:
            type: rawscript
            content: "export async function main() { return 'A'; }"
            language: bun
            input_transforms: {}
    - summary: "Branch B"
      skip_failure: true
      modules:
        - id: branch_b_step
          value:
            type: rawscript
            content: "export async function main() { return 'B'; }"
            language: bun
            input_transforms: {}
```
### 8. Identity (Pass-through)
```yaml
id: identity_step
value:
  type: identity
  flow: false # Set to true if this represents a sub-flow
```
## Input Transforms & Data Flow
### JavaScript Expressions
Reference data using these variables in `expr` fields:
- `flow_input.property_name` - Access workflow inputs
- `results.step_id` - Access outputs from previous steps
- `results.step_id.property` - Access specific properties
- `flow_input.iter.value` - Current iteration value (in loops)
- `flow_input.iter.index` - Current iteration index (in loops)
### Static Values
```yaml
input_transforms:
  param_name:
    type: static
    value: "fixed_string" # Can be string, number, boolean, object, array
```
### Resource References
```yaml
input_transforms:
  database:
    type: static
    value: "$res:f/folder/my_database" # Reference to stored resource
```
## Advanced Module Properties
### Error Handling & Control Flow
```yaml
id: step_id
value: # ... module definition
# Control flow options:
stop_after_if:
  expr: "results.step_id.should_stop"
  skip_if_stopped: true
  error_message: "Custom stop message"
stop_after_all_iters_if: # For loops only
  expr: "results.step_id.should_stop_loop"
  skip_if_stopped: false
skip_if:
  expr: "results.step_id.should_skip"
sleep:
  type: javascript
  expr: "flow_input.delay_seconds"
continue_on_error: false # Continue workflow if this step fails
delete_after_use: false # Clean up results after use
# Execution control:
cache_ttl: 3600 # Cache results for 1 hour
timeout: 300 # Step timeout in seconds
priority: 0 # Higher numbers = higher priority
mock:
  enabled: false
  return_value: "mocked_result"
# Suspend/Approval:
suspend:
  required_events: 1 # Number of resume events needed
  timeout: 86400 # Timeout in seconds
  resume_form:
    schema:
      type: object
      properties:
        approved:
          type: boolean
  user_auth_required: true
  user_groups_required:
    type: static
    value: ["admin"]
  self_approval_disabled: false
  hide_cancel: false
  continue_on_disapprove_timeout: false
# Retry configuration:
retry:
  constant:
    attempts: 3
    seconds: 5
  # OR exponential backoff:
  # exponential:
  #   attempts: 3
  #   multiplier: 2
  #   seconds: 1
  #   random_factor: 10 # 0-100% jitter
```
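To illustrate how these options pair with step results: a small guard step (a sketch with illustrative names, not a fixed pattern) can compute the flag that a `stop_after_if` expression such as `results.guard.should_stop` reads:
```typescript
// Step `guard`: a module can then declare
//   stop_after_if: { expr: "results.guard.should_stop" }
export async function main(pending_items: number) {
  // Signal the flow to stop early when nothing is left to process.
  return { should_stop: pending_items === 0 };
}
```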
## Special Modules
### Failure Handler (Error Handler)
```yaml
value:
  failure_module:
    id: failure
    value:
      type: rawscript
      content: |
        export async function main(error: any) {
          // error.message, error.step_id, error.name, error.stack
          console.log("Flow failed:", error.message);
          return error;
        }
      language: bun
      input_transforms: {}
```
### Preprocessor
```yaml
value:
  preprocessor_module:
    id: preprocessor
    value:
      type: rawscript
      content: |
        export async function main() {
          console.log("Flow starting...");
          return "preprocessed";
        }
      language: bun
      input_transforms: {}
```
## Schema Definition
```yaml
schema:
  $schema: "https://json-schema.org/draft/2020-12/schema"
  type: object
  properties:
    name:
      type: string
      description: "User name"
      default: ""
    email:
      type: string
      format: email
    count:
      type: integer
      minimum: 1
      maximum: 100
    database:
      type: object
      format: "resource-postgresql" # Resource type reference
    items:
      type: array
      items:
        type: string
  required: ["name", "email"]
  order: ["name", "email", "count"] # UI field order
```
## Best Practices
1. **Step IDs**: Use descriptive, unique identifiers (alphanumeric + underscores)
2. **Data Flow**: Chain steps using `results.step_id` references
3. **Error Handling**: Add failure_module for critical workflows
4. **Languages**: Use `bun` for TypeScript (fastest), `python3` for Python
5. **Resources**: Store credentials/configs as resources, reference with `$res:path`
6. **Loops**: Prefer `parallel: true` for independent iterations
7. **Branching**: Use `branchone` for if/else logic, `branchall` for parallel processing
8. **Schemas**: Always define input schemas for better UX and validation
## Example Complete Workflow
```yaml
summary: "Process user data"
description: "Validates user input, processes data, and sends notifications"
value:
modules:
- id: validate_input
value:
type: rawscript
content: '!inline inline_script_0.inline_script.ts'
# script at path inline_script_0.inline_script.ts will contain
# export async function main(email: string, name: string) {
# if (!email.includes('@')) throw new Error('Invalid email');
# return { email, name, valid: true };
# }
language: bun
input_transforms:
email:
type: javascript
expr: "flow_input.email"
name:
type: javascript
expr: "flow_input.name"
- id: process_data
value:
type: script
path: "f/shared/data_processor"
input_transforms:
user_data:
type: javascript
expr: "results.validate_input"
- id: send_notification
value:
type: rawscript
content: '!inline inline_script_1.inline_script.ts'
# script at path inline_script_1.inline_script.ts will contain
# export async function main(processed_data: any) {
# console.log("Sending notification for:", processed_data.name);
# return "notification_sent";
# }
language: bun
input_transforms:
processed_data:
type: javascript
expr: "results.process_data"
schema:
type: object
properties:
email:
type: string
format: email
description: "User email address"
name:
type: string
description: "User full name"
required: ["email", "name"]
```
When generating OpenFlow YAML, ensure proper indentation, valid YAML syntax, and logical step dependencies. Always include meaningful summaries and proper input transforms to connect workflow steps.


@@ -0,0 +1,189 @@
description: ''
value: |-
  # Global table strategy (can be overridden per table)
  table_strategy: drop_and_recreate
  schema:
    name: ccr_etl_raw
    version: 1.0.0
    description: CCR ETL Raw Data Schema
    tables:
      - name: mtgjson_skus
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: uuid
            type: string
            description: The UUID of the MTGJSON SKU
          - name: condition
            type: string
            description: The condition of the MTGJSON SKU
          - name: language
            type: string
            description: The language of the MTGJSON SKU
          - name: printing
            type: string
            description: The printing of the MTGJSON SKU
          - name: finish
            type: string
            description: The finish of the MTGJSON SKU
          - name: productId
            type: string
            description: The tcgplayer product ID of the MTGJSON SKU
          - name: skuId
            type: string
            description: The tcgplayer SKU ID of the MTGJSON SKU
      - name: mtgjson_identifiers
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: uuid
            type: string
            description: The UUID of the MTGJSON Identifier
          - name: name
            type: string
            description: The name of the MTGJSON Identifier
          - name: setCode
            type: string
            description: The set code of the MTGJSON Identifier
          - name: abuId
            type: string
            description: The Abu Games ID
          - name: cardKingdomEtchedId
            type: string
            description: The Card Kingdom Etched ID
          - name: cardKingdomFoilId
            type: string
            description: The Card Kingdom Foil ID
          - name: cardKingdomId
            type: string
            description: The Card Kingdom ID
          - name: cardsphereId
            type: string
            description: The Cardsphere ID
          - name: cardsphereFoilId
            type: string
            description: The Cardsphere Foil ID
          - name: cardtraderId
            type: string
            description: The Cardtrader ID
          - name: csiId
            type: string
            description: The Cool Stuff Inc ID
          - name: mcmId
            type: string
            description: The cardmarket ID
          - name: mcmMetaId
            type: string
            description: The cardmarket meta ID
          - name: miniaturemarketId
            type: string
            description: The miniaturemarket ID
          - name: mtgArenaId
            type: string
            description: The mtg arena ID
          - name: mtgjsonFoilVersionId
            type: string
            description: The uuid generated by mtgjson for the foil version of a card
          - name: mtgjsonNonFoilVersionId
            type: string
            description: The uuid generated by mtgjson for the non-foil version of a card
          - name: mtgjsonV4Id
            type: string
            description: The uuid generated by mtgjson v4 for a card
          - name: mtgoFoilId
            type: string
            description: The mtgo foil ID
          - name: mtgoId
            type: string
            description: The mtgo ID
          - name: multiverseId
            type: string
            description: The multiverse ID used by wotc for gatherer
          - name: scgId
            type: string
            description: The starcitygames ID
          - name: scryfallId
            type: string
            description: The scryfall ID
          - name: scryfallCardBackId
            type: string
            description: The scryfall card back ID
          - name: scryfallOracleId
            type: string
            description: The scryfall oracle ID
          - name: scryfallIllustrationId
            type: string
            description: The scryfall illustration ID
          - name: tcgplayerProductId
            type: string
            description: The tcgplayer product ID
          - name: tcgplayerEtchedProductId
            type: string
            description: The tcgplayer etched product ID
          - name: tntId
            type: string
            description: The Troll and Toad ID
      - name: tcgcsv_categories
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: categoryId
            type: integer
          - name: name
            type: string
          - name: modifiedOn
            type: string
          - name: displayName
            type: string
          - name: seoCategoryName
            type: string
          - name: categoryDescription
            type: string
          - name: categoryPageTitle
            type: string
          - name: sealedLabel
            type: string
          - name: nonSealedLabel
            type: string
          - name: conditionGuideUrl
            type: string
          - name: isScannable
            type: boolean
          - name: popularity
            type: integer
          - name: isDirect
            type: boolean
      - name: tcgcsv_groups
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            primary_key: true
            autoincrement: true
          - name: groupId
            type: integer
          - name: name
            type: string
          - name: abbreviation
            type: string
          - name: isSupplemental
            type: boolean
          - name: publishedOn
            type: string
          - name: modifiedOn
            type: string
          - name: categoryId
            type: integer
is_secret: false


@@ -0,0 +1,89 @@
import os
import wmill
import yaml
from sqlalchemy import create_engine, text, MetaData, Table, Column, Integer, String, inspect
from sqlalchemy.engine import Engine
import psycopg2

# You can import any PyPI package.
# See here for more info: https://www.windmill.dev/docs/advanced/dependencies_in_python
# You can use typed resources by doing a type alias to dict:
# postgresql = dict

DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'


def create_db_engine(db: dict) -> Engine:
    db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = create_engine(db_url)
    # Open and immediately close a test connection so a bad config fails fast.
    engine.connect().close()
    return engine


def table_exists(engine: Engine, table_name: str) -> bool:
    """Check if a table exists in the database."""
    inspector = inspect(engine)
    return table_name in inspector.get_table_names()


def create_table(engine: Engine, table: dict, strategy: str = "create_if_not_exists"):
    # Resolve the name before the try block so the error message can always reference it.
    table_name = table['name']
    try:
        columns = table['columns']
        # Handle different table strategies
        if strategy == "drop_and_recreate":
            if table_exists(engine, table_name):
                print(f"Dropping existing table: {table_name}")
                with engine.connect() as conn:
                    conn.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE"))
                    conn.commit()
        elif strategy == "create_if_not_exists":
            if table_exists(engine, table_name):
                print(f"Table {table_name} already exists, skipping creation")
                return
        else:
            raise ValueError(f"Unknown table strategy: {strategy}")
        # Map config types to SQLAlchemy types
        type_mapping = {
            'integer': Integer,
            'string': String,
        }
        # Build SQLAlchemy columns
        sqlalchemy_columns = []
        for column in columns:
            col_type = type_mapping.get(column['type'], String)
            sqlalchemy_columns.append(Column(
                column['name'],
                col_type,
                primary_key=column.get('primary_key', False),
                nullable=column.get('nullable', True),
                index=column.get('index', False),
                autoincrement=column.get('autoincrement', False),
            ))
        # Create table using SQLAlchemy Core
        metadata = MetaData()
        Table(table_name, metadata, *sqlalchemy_columns)
        metadata.create_all(engine)
        print(f"Successfully created table: {table_name}")
    except Exception as e:
        print(f"Error creating table {table_name}: {str(e)}")
        raise


def main():
    db = wmill.get_resource(DB_RESOURCE_PATH)
    config_yaml = wmill.get_variable(DB_CONFIG_PATH)
    config = yaml.safe_load(config_yaml)
    engine = create_db_engine(db)
    # Get table strategy from config (default to drop_and_recreate)
    table_strategy = config.get('table_strategy', 'drop_and_recreate')
    print(f"Using table strategy: {table_strategy}")
    for table in config['schema']['tables']:
        # Allow per-table strategy override
        table_specific_strategy = table.get('strategy', table_strategy)
        create_table(engine, table, table_specific_strategy)
    return {"status": "success"}


@@ -0,0 +1,14 @@
# py: 3.11
anyio==4.10.0
certifi==2025.8.3
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
psycopg2-binary==2.9.10
pyyaml==6.0.2
sniffio==1.3.1
sqlalchemy==2.0.43
typing-extensions==4.15.0
wmill==1.538.0


@@ -0,0 +1,9 @@
summary: ''
description: ''
lock: '!inline f/CCR_ETL/ccr_etl_db_init.script.lock'
kind: script
schema:
  $schema: 'https://json-schema.org/draft/2020-12/schema'
  type: object
  properties: {}
  required: []


@@ -0,0 +1,377 @@
"""
CCR ETL MTGJSON Processing Script
This script handles the extraction, transformation, and loading of MTGJSON data
into a PostgreSQL database. It supports downloading, unzipping, preprocessing,
and batch inserting of various data formats.
"""
import json
import os
import yaml
from typing import Union
from zipfile import ZipFile
import psycopg2
import requests
import wmill
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
# Configuration paths
DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'
EXTRACT_CONFIG_PATH = 'f/CCR_ETL/ccr_extract_config'
DOWNLOAD_CONFIG_PATH = './shared/'
# Default processing parameters
DEFAULT_BATCH_SIZE = 1000
def validate_response_headers(response: requests.Response, outer_file_type: str) -> None:
"""Validate that the response content type matches the expected file type."""
if response.headers['Content-Type'] != f'application/{outer_file_type}':
raise ValueError(f"Expected {outer_file_type} response, got {response.headers['Content-Type']}")
def download_request(url: str, filename: str, outer_file_type: str) -> bytes:
"""Download a file from the given URL and validate its content type."""
print(f"🔽 Downloading {filename} from {url}")
response = requests.get(url)
response.raise_for_status()
validate_response_headers(response, outer_file_type)
print(f"✅ Download successful ({response.headers.get('Content-Length', 'Unknown')} bytes)")
return response.content
def generate_download_queue(url: str, filename: str, outer_file_type: str, iterables: dict) -> list:
"""
Generate a queue of download items based on URL templates and iterable values.
Example:
url = "https://tcgcsv.com/tcgplayer/{game_id}/groups"
iterables = {'game_id': [1,3,65,71,86]}
"""
queue = []
for key, values in iterables.items():
    for item in values:
        queue_item = {
            # str.format(key=..., value=...) would leave a template like
            # "{game_id}" unresolved and raise KeyError; expand the iterable
            # name itself instead
            'url': url.format(**{key: item}),
            'filename': filename.format(**{key: item}),
            'outer_file_type': outer_file_type,
        }
        queue.append(queue_item)
return queue
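# Illustrative expansion (values taken from the extract config below): for
# url="https://tcgcsv.com/tcgplayer/{game_id}/groups" and
# iterables={'game_id': [1, 3, 65, 71, 86]}, the queue holds five items,
# e.g. {'url': 'https://tcgcsv.com/tcgplayer/1/groups',
#       'filename': 'tcgplayer_1_groups.json', 'outer_file_type': 'json'}.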
def save_file(content: bytes, filename: str) -> None:
"""Save binary content to a file in the download directory."""
filepath = DOWNLOAD_CONFIG_PATH + filename
with open(filepath, 'wb') as f:
f.write(content)
print(f"💾 Saved {len(content)} bytes to {filename}")
def unzip_file(filename: str) -> str:
"""Extract a zip file and return the name of the extracted content."""
new_filename = filename.replace('.zip', '')
zip_path = DOWNLOAD_CONFIG_PATH + filename
with ZipFile(zip_path, 'r') as zip_ref:
file_list = zip_ref.namelist()
print(f"📦 Extracting {len(file_list)} files from {filename}")
zip_ref.extractall(DOWNLOAD_CONFIG_PATH)
return new_filename
def load_file(filename: str, file_type: str) -> Union[dict, list]:
"""Load and parse a file from the download directory."""
filepath = DOWNLOAD_CONFIG_PATH + filename
if file_type == 'json':
with open(filepath, 'r') as f:
data = json.load(f)
print(f"📖 Loaded {file_type} file: {filename}")
return data
else:
raise ValueError(f"Unsupported file type: {file_type}")
def build_record_from_config(source_data: dict, expected_columns: list, additional_data: dict = None) -> dict:
"""
Build a record using the structure defined in the extract config.
Args:
source_data: The source data dictionary
expected_columns: List of column definitions from config
additional_data: Optional additional data to merge (e.g., parent UUID)
Returns:
Dictionary representing a single database record
"""
if additional_data is None:
additional_data = {}
# Merge source data with additional data (like uuid from parent structure)
combined_data = {**source_data, **additional_data}
record = {}
for column in expected_columns:
col_name = column['name']
# Skip auto-increment columns (like 'id')
if column.get('auto_increment', False):
continue
# Get value from combined data, use empty string as default
record[col_name] = combined_data.get(col_name, '')
return record
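# Worked example (hypothetical data): with expected_columns
# [{'name': 'id', 'auto_increment': True}, {'name': 'uuid'}, {'name': 'skuId'}],
# source_data={'skuId': '42'} and additional_data={'uuid': 'abc'}, the result
# is {'uuid': 'abc', 'skuId': '42'} -- 'id' is skipped, and any column absent
# from the merged data defaults to ''.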
def create_db_engine(db: dict) -> Engine:
"""Create and test a database engine connection."""
db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
engine = create_engine(db_url)
# Test connection
conn = engine.connect()
conn.close()
print(f"🔌 Connected to database: {db['host']}:{db['port']}/{db['dbname']}")
return engine
def get_db_engine() -> Engine:
"""Get a database engine using the configured resource."""
db = wmill.client.get_resource(DB_RESOURCE_PATH)
return create_db_engine(db)
def generic_preprocess(
data: Union[dict, list],
expected_columns: list,
config: dict
) -> list:
"""
Generic data preprocessing function that handles various data structures.
Args:
data: Source data (dict or list)
expected_columns: List of column definitions
config: Preprocessing configuration
Returns:
List of processed records
"""
# Step 1: Follow data path
data_path = config.get("data_path", [])
for key in data_path:
if not isinstance(data, dict):
raise ValueError(f"Expected dict while navigating path, got {type(data)} at key '{key}'")
data = data.get(key)
if data is None:
raise ValueError(f"Missing key '{key}' in data path: {data_path}")
# Step 2: Handle nested structure
nested = config.get("nested", False)
nested_key = config.get("nested_key", None)
id_key = config.get("id_key", None)
flatten = config.get("flatten", False)
records = []
if isinstance(data, dict):
items = data.items()
elif isinstance(data, list):
items = enumerate(data)
else:
raise ValueError(f"Unsupported data structure: {type(data)}")
for outer_key, outer_value in items:
if nested:
if not isinstance(outer_value, list):
continue
for inner_value in outer_value:
if id_key and not inner_value.get(id_key):
continue
additional_data = {nested_key: outer_key} if nested_key else {}
record = build_record_from_config(inner_value, expected_columns, additional_data)
records.append(record)
else:
if not isinstance(outer_value, dict):
continue
if id_key and not outer_value.get(id_key):
continue
if flatten:
nested_data = outer_value.get("identifiers", {})
combined = {**nested_data, "uuid": outer_value.get("uuid")}
record = build_record_from_config(combined, expected_columns)
else:
record = build_record_from_config(outer_value, expected_columns)
records.append(record)
print(f"🔄 Processed {len(records)} records")
return records
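# Illustrative walk-through (payload shapes assumed from the MTGJSON files):
# with config={'data_path': ['data'], 'nested': True, 'nested_key': 'uuid',
# 'id_key': 'skuId'} and data={'data': {'<uuid>': [{'skuId': 1, ...}]}},
# each inner sku dict becomes one record carrying its parent key as 'uuid'.
# With nested=False and flatten=True, the 'identifiers' sub-dict of each card
# is merged with the card's own 'uuid' before the record is built.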
def control_batch(data: list, batch_size: int = DEFAULT_BATCH_SIZE):
"""Split data into batches for processing."""
for i in range(0, len(data), batch_size):
yield data[i:i+batch_size]
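# Example: control_batch(list(range(2500)), 1000) yields three slices of
# 1000, 1000, and 500 items, one at a time, so callers never hold more than
# one batch beyond the source list itself.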
def insert_data_into_table_batch(records: list, table: str, engine: Engine, batch_size: int = DEFAULT_BATCH_SIZE) -> None:
"""Insert records into database table in batches."""
if not records:
print("⚠️ No records to insert, skipping database operation")
return
print(f"💾 Inserting {len(records)} records into {table} (batch size: {batch_size})")
# Get column names from first record
columns = list(records[0].keys())
column_names = ', '.join(f'"{col}"' for col in columns)
placeholders = ', '.join([f':{col}' for col in columns])
insert_sql = f"INSERT INTO {table} ({column_names}) VALUES ({placeholders})"
with engine.connect() as conn:
batch_count = 0
total_inserted = 0
for batch in control_batch(records, batch_size):
batch_count += 1
batch_size_actual = len(batch)
conn.execute(text(insert_sql), batch)
total_inserted += batch_size_actual
if batch_count % 10 == 0:
print(f"⏳ Inserted {total_inserted}/{len(records)} records...")
conn.commit()
print(f"✅ Inserted {total_inserted} records in {batch_count} batches")
def process_job(job: dict) -> dict:
"""
Process a single ETL job.
Args:
job: Job configuration dictionary
Returns:
Dictionary with job processing results
"""
# Extract job parameters
url = job.get('url')
filename = job.get('filename')
outer_file_type = job.get('outer_file_type')
inner_file_type = job.get('inner_file_type')
table = job.get('table')
expected_columns = job.get('expected_columns')
batch_size = job.get('batch_size', DEFAULT_BATCH_SIZE)
preprocess_function_name = job.get('preprocess_function', 'generic_preprocess')
preprocess_config = job.get('preprocess_config')
active = job.get('active')
iterables = job.get('iterables')
print(f"\n🚀 Processing job for table '{table}'")
if not active:
    print("⚠️ Job is not active, skipping")
    return {"status": "skipped"}
# Resolve the preprocessing function by name; a non-string value would
# otherwise leave preprocess_function undefined below
preprocess_function = globals().get(preprocess_function_name) if isinstance(preprocess_function_name, str) else preprocess_function_name
if not callable(preprocess_function):
    raise ValueError(f"Preprocessing function '{preprocess_function_name}' not found or not callable.")
# Get database engine
engine = get_db_engine()
# Populate download queue
if iterables:
queue = generate_download_queue(url, filename, outer_file_type, iterables)
else:
queue = [{
'url': url,
'filename': filename,
'outer_file_type': outer_file_type,
'inner_file_type': inner_file_type,
'table': table,
'expected_columns': expected_columns
}]
# Process the download queue; load and insert each file as it is fetched so
# that iterable jobs (one file per game_id) are all processed, not only the
# last queue entry, and templated filenames are resolved per item
records = []
saved_filename = filename
for queue_item in queue:
    item_filename = queue_item.get('filename')
    content = download_request(queue_item.get('url'), item_filename, queue_item.get('outer_file_type'))
    save_file(content, item_filename)
    # Handle file extraction if needed
    saved_filename = item_filename
    if outer_file_type == 'zip':
        saved_filename = unzip_file(item_filename)
    # Load and preprocess data
    data = load_file(saved_filename, inner_file_type)
    records.extend(preprocess_function(data, expected_columns, preprocess_config))
# Insert data into database
insert_data_into_table_batch(records, table, engine, batch_size)
result = {
"status": "success",
"table": table,
"records_processed": len(records),
"filename": saved_filename
}
print(f"✅ Job complete: {len(records)} records processed for {table}")
return result
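# Usage sketch: process_job consumes one entry of the 'jobs' list from the
# extract config, e.g. (abbreviated from ccr_extract_config below)
#   process_job({
#       'name': 'mtgjson_skus', 'active': True,
#       'url': 'https://mtgjson.com/api/v5/TcgplayerSkus.json.zip',
#       'filename': 'TcgplayerSkus.json.zip',
#       'outer_file_type': 'zip', 'inner_file_type': 'json',
#       'table': 'mtgjson_skus', 'batch_size': 1000,
#       'preprocess_config': {...}, 'expected_columns': [...],
#   })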
def main() -> dict:
"""
Main ETL processing function.
Returns:
Dictionary with overall processing results
"""
print("🎯 ETL Process Starting")
print("=" * 50)
# Load configuration
config_yaml = wmill.get_variable(EXTRACT_CONFIG_PATH)
config = yaml.safe_load(config_yaml)
print(f"📋 Processing {len(config['jobs'])} jobs")
results = []
successful_jobs = 0
failed_jobs = 0
for i, job in enumerate(config['jobs'], 1):
print(f"\n--- Job {i}/{len(config['jobs'])} ---")
try:
result = process_job(job)
results.append(result)
successful_jobs += 1
except Exception as e:
error_result = {
"status": "error",
"table": job.get('table', 'unknown'),
"error": str(e),
"filename": job.get('filename', 'unknown')
}
results.append(error_result)
failed_jobs += 1
print(f"❌ Job {i} failed: {str(e)}")
print(f"\n🏁 ETL Process Complete")
print(f"✅ Successful: {successful_jobs} | ❌ Failed: {failed_jobs} | 📋 Total: {len(results)}")
return {
"status": "completed",
"jobs_processed": len(results),
"successful_jobs": successful_jobs,
"failed_jobs": failed_jobs,
"results": results
}

View File

@@ -0,0 +1,17 @@
# py: 3.11
anyio==4.10.0
certifi==2025.8.3
charset-normalizer==3.4.3
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
psycopg2-binary==2.9.10
pyyaml==6.0.2
requests==2.32.5
sniffio==1.3.1
sqlalchemy==2.0.43
typing-extensions==4.15.0
urllib3==2.5.0
wmill==1.539.1

View File

@@ -0,0 +1,9 @@
summary: CCR ETL MTGJSON
description: ''
lock: '!inline f/CCR_ETL/ccr_etl_mtgjson.script.lock'
kind: script
schema:
$schema: 'https://json-schema.org/draft/2020-12/schema'
type: object
properties: {}
required: []

View File

@@ -0,0 +1,162 @@
description: ''
value: |-
jobs:
- name: mtgjson_skus
active: true
url: https://mtgjson.com/api/v5/TcgplayerSkus.json.zip
filename: TcgplayerSkus.json.zip
outer_file_type: zip
inner_file_type: json
preprocess_config:
data_path: ["data"]
nested: true
nested_key: "uuid"
id_key: "skuId"
table: mtgjson_skus
batch_size: 1000
expected_columns:
- name: uuid
type: string
- name: condition
type: string
- name: language
type: string
- name: printing
type: string
- name: finish
type: string
- name: productId
type: string
- name: skuId
type: string
cache:
status: true
ttl: 86400
- name: mtgjson_identifiers
active: true
url: https://mtgjson.com/api/v5/AllIdentifiers.json.zip
filename: AllIdentifiers.json.zip
outer_file_type: zip
inner_file_type: json
preprocess_config:
data_path: ["data"]
nested: false
flatten: true
id_key: "uuid"
table: mtgjson_identifiers
batch_size: 1000
expected_columns:
- name: uuid
type: string
- name: name
type: string
- name: setCode
type: string
- name: abuId
type: string
- name: cardKingdomEtchedId
type: string
- name: cardKingdomFoilId
type: string
- name: cardKingdomId
type: string
- name: cardsphereId
type: string
- name: cardsphereFoilId
type: string
- name: cardtraderId
type: string
- name: csiId
type: string
- name: mcmId
type: string
- name: mcmMetaId
type: string
- name: miniaturemarketId
type: string
- name: mtgArenaId
type: string
- name: mtgjsonFoilVersionId
type: string
- name: mtgjsonNonFoilVersionId
type: string
- name: mtgjsonV4Id
type: string
- name: mtgoFoilId
type: string
- name: mtgoId
type: string
- name: multiverseId
type: string
- name: scgId
type: string
- name: scryfallId
type: string
- name: scryfallCardBackId
type: string
- name: scryfallOracleId
type: string
- name: scryfallIllustrationId
type: string
- name: tcgplayerProductId
type: string
- name: tcgplayerEtchedProductId
type: string
- name: tntId
type: string
cache:
status: true
ttl: 86400
- name: tcgcsv_categories
active: true
url: https://tcgcsv.com/tcgplayer/categories
outer_file_type: json
preprocess_config:
data_path: ["results"]
nested: false
filename: tcgplayer_categories.json
expected_columns:
- name: categoryId
type: integer
- name: name
type: string
- name: modifiedOn
type: string
- name: displayName
type: string
- name: seoCategoryName
type: string
- name: categoryDescription
type: string
- name: categoryPageTitle
type: string
- name: sealedLabel
type: string
- name: nonSealedLabel
type: string
- name: conditionGuideUrl
type: string
- name: isScannable
type: boolean
- name: popularity
type: integer
- name: isDirect
type: boolean
- name: tcgcsv_groups
active: true
url: https://tcgcsv.com/tcgplayer/{game_id}/groups
outer_file_type: json
preprocess_config:
data_path: ["results"]
nested: false
filename: tcgplayer_{game_id}_groups.json
expected_columns:
- name: groupId
type: integer
- name: name
type: string
- name: modifiedOn
type: string
iterables:
game_id: [1,3,65,71,86]
is_secret: false

View File

@@ -0,0 +1,6 @@
summary: null
display_name: CCR_ETL
extra_perms:
u/joshuakrzemien: true
owners:
- u/joshuakrzemien

View File

@@ -1,5 +1,10 @@
version: v2
locks:
f/CCR_ETL/ccr_etl_db_config+672b195893fa2357771ffa3cedc08fd5c8a89b2e831453c694fa2e6491f3b13b: 672b195893fa2357771ffa3cedc08fd5c8a89b2e831453c694fa2e6491f3b13b
f/CCR_ETL/ccr_etl_db_config+92fd8458ec5f2ae9db765da90a8b5f68086769e62767933a095ce03f154e2863: 92fd8458ec5f2ae9db765da90a8b5f68086769e62767933a095ce03f154e2863
f/CCR_ETL/ccr_etl_db_init+55def58b140529028aec966f2f8ff7a98380842e3edd0cca985bad0e18dd3533: 55def58b140529028aec966f2f8ff7a98380842e3edd0cca985bad0e18dd3533
f/CCR_ETL/ccr_etl_mtgjson+7b4ff1f872736a4935a108283f1cebac0a8ef84f174eeccd797652eb1e9f004b: 7b4ff1f872736a4935a108283f1cebac0a8ef84f174eeccd797652eb1e9f004b
f/CCR_ETL/ccr_etl_mtgjson+c18f5bd64c559e4765379b2e0bcbfd06df1b8becf3a47bfd6707017878ffc610: c18f5bd64c559e4765379b2e0bcbfd06df1b8becf3a47bfd6707017878ffc610
u/joshuakrzemien/db_test+972c08a79a04b0017b517504a61b5d5069e82b199171e0569f281508306d8c46: 972c08a79a04b0017b517504a61b5d5069e82b199171e0569f281508306d8c46
u/joshuakrzemien/mtg_json_download+f31539309b4d4b4ef529e47352f676f64e58e2a1b2797c2a630acab8a7c40260: f31539309b4d4b4ef529e47352f676f64e58e2a1b2797c2a630acab8a7c40260
u/joshuakrzemien/mtgjson_unzip+0bdcf659018721653979c2de3bfd9ce2f70cf0e76ef6993226ee76e9d6c73dc4: 0bdcf659018721653979c2de3bfd9ce2f70cf0e76ef6993226ee76e9d6c73dc4

24
wmill.yaml Normal file
View File

@@ -0,0 +1,24 @@
defaultTs: bun
includes:
- f/**
excludes: []
codebases: []
skipVariables: false
skipResources: false
skipResourceTypes: false
skipSecrets: true
skipScripts: false
skipFlows: false
skipApps: false
skipFolders: false
includeSchedules: false
includeTriggers: false
includeUsers: false
includeGroups: false
includeSettings: false
includeKey: false
gitBranches:
master:
overrides: {}
baseUrl: 'http://192.168.1.41:8009/'
workspaceId: ccr-cards