mirror of https://github.com/onepanelio/onepanel.git (synced 2025-09-26 17:51:13 +08:00)
Merge pull request #860 from onepanelio/feat/update-templates
feat: Update all templates for dynamic node labels and new file syncing
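The templates in this diff select nodes through a {{.NodePoolLabel}} token and seed the node-pool dropdown with {{.DefaultNodePoolOption}}, and each workspace gains an onepanel/filesyncer sidecar behind a /sys/filesyncer route. As a rough sketch of how such tokens could be rendered before a manifest is stored — assuming plain Go text/template semantics; the program and the example label key below are illustrative, not Onepanel's actual API:

// Illustrative sketch only: how {{.NodePoolLabel}} tokens in the manifests
// below could be filled in from cluster configuration using text/template.
// All names and values here are assumptions, not Onepanel's real API.
package main

import (
    "os"
    "text/template"
)

func main() {
    // A fragment of the manifests in this diff, with the Argo-level
    // {{workflow.parameters...}} token left out for simplicity.
    manifest := "nodeSelector:\n  {{.NodePoolLabel}}: 'default-pool'\n"

    tmpl := template.Must(template.New("manifest").Parse(manifest))

    // The label key would come from the provider's node-pool configuration,
    // e.g. an instance-type label on cloud or a custom label on bare metal.
    _ = tmpl.Execute(os.Stdout, map[string]string{
        "NodePoolLabel": "node.kubernetes.io/instance-type", // assumed example value
    })
}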
@@ -86,7 +86,7 @@ templates:
        optional: true
`

-const pytorchMnistWorkflowTemplateName = "PyTorch Training"
+const pytorchWorkflowTemplateName = "PyTorch Training"

func initialize20200605090509() {
    if _, ok := initializedMigrations[20200605090509]; !ok {
@@ -120,7 +120,7 @@ func Up20200605090509(tx *sql.Tx) error {
    }

    workflowTemplate := &v1.WorkflowTemplate{
-       Name:     pytorchMnistWorkflowTemplateName,
+       Name:     pytorchWorkflowTemplateName,
        Manifest: pytorchMnistWorkflowTemplate,
    }

@@ -150,7 +150,7 @@ func Down20200605090509(tx *sql.Tx) error {
        return err
    }

-   uid, err := uid2.GenerateUID(pytorchMnistWorkflowTemplateName, 30)
+   uid, err := uid2.GenerateUID(pytorchWorkflowTemplateName, 30)
    if err != nil {
        return err
    }

@@ -17,7 +17,7 @@ func initialize20201221194344() {
func Up20201221194344(tx *sql.Tx) error {
    return updateWorkflowTemplateManifest(
        filepath.Join("workflows", "pytorch-mnist-training", "20201221194344.yaml"),
-       pytorchMnistWorkflowTemplateName,
+       pytorchWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
        },
@@ -29,7 +29,7 @@ func Down20201221194344(tx *sql.Tx) error {
    // This code is executed when the migration is rolled back.
    return updateWorkflowTemplateManifest(
        filepath.Join("workflows", "pytorch-mnist-training", "20200605090509.yaml"),
-       pytorchMnistWorkflowTemplateName,
+       pytorchWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
        },
db/go/20210118175809_update_node_pool_label.go (new file, 129 lines)
@@ -0,0 +1,129 @@
package migration

import (
    "database/sql"
    "github.com/pressly/goose"
    "path/filepath"
)

func initialize20210118175809() {
    if _, ok := initializedMigrations[20210118175809]; !ok {
        goose.AddMigration(Up20210118175809, Down20210118175809)
        initializedMigrations[20210118175809] = true
    }
}

// Up20210118175809 updates workflows so that the nodePoolSelector label is based on k8s config
func Up20210118175809(tx *sql.Tx) error {
    // This code is executed when the migration is applied.
    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "hyperparameter-tuning", "20210118175809.yaml"),
        hyperparameterTuningTemplateName,
        map[string]string{
            "framework":  "tensorflow",
            "tuner":      "TPE",
            "created-by": "system",
        },
    ); err != nil {
        return err
    }

    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "maskrcnn-training", "20210118175809.yaml"),
        maskRCNNWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "used-by":    "cvat",
        },
    ); err != nil {
        return err
    }

    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "pytorch-mnist-training", "20210118175809.yaml"),
        pytorchWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "framework":  "pytorch",
        },
    ); err != nil {
        return err
    }

    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "tensorflow-mnist-training", "20210118175809.yaml"),
        tensorflowWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "framework":  "tensorflow",
        },
    ); err != nil {
        return err
    }

    return updateWorkflowTemplateManifest(
        filepath.Join("workflows", "tf-object-detection-training", "20210118175809.yaml"),
        tensorflowObjectDetectionWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "used-by":    "cvat",
        },
    )
}

// Down20210118175809 reverts the migration
func Down20210118175809(tx *sql.Tx) error {
    // This code is executed when the migration is rolled back.
    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "tf-object-detection-training", "20201223202929.yaml"),
        tensorflowObjectDetectionWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "used-by":    "cvat",
        },
    ); err != nil {
        return err
    }

    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "tensorflow-mnist-training", "20201223062947.yaml"),
        tensorflowWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "framework":  "tensorflow",
        },
    ); err != nil {
        return err
    }

    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "pytorch-mnist-training", "20201221194344.yaml"),
        pytorchWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
        },
    ); err != nil {
        return err
    }

    if err := updateWorkflowTemplateManifest(
        filepath.Join("workflows", "maskrcnn-training", "20201221195937.yaml"),
        maskRCNNWorkflowTemplateName,
        map[string]string{
            "created-by": "system",
            "used-by":    "cvat",
        },
    ); err != nil {
        return err
    }

    return updateWorkflowTemplateManifest(
        filepath.Join("workflows", "hyperparameter-tuning", "20201225172926.yaml"),
        hyperparameterTuningTemplateName,
        map[string]string{
            "framework":  "tensorflow",
            "tuner":      "TPE",
            "created-by": "system",
        },
    )
}
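updateWorkflowTemplateManifest and updateWorkspaceTemplateManifest are called throughout these migrations but defined elsewhere in the repository. A minimal sketch of the assumed shape, inferred only from the call sites above — the signature, the db/yaml base path, and the described behavior are assumptions, not the repo's actual helper:

// A minimal sketch inferred from the call sites above; the real helper lives
// elsewhere in the repo and its signature/behavior may differ.
package migration

import (
    "os"
    "path/filepath"
)

// updateWorkflowTemplateManifestSketch reads the versioned manifest that a
// migration points at and would register it as the newest version of the
// named workflow template, attaching the given labels.
func updateWorkflowTemplateManifestSketch(manifestPath, templateName string, labels map[string]string) error {
    data, err := os.ReadFile(filepath.Join("db", "yaml", manifestPath))
    if err != nil {
        return err
    }
    // Assumed behavior: store `data` as the latest manifest version for
    // templateName in each namespace, carrying `labels` along.
    _, _, _ = data, templateName, labels
    return nil
}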
db/go/20210129094725_update_cvat_workspace.go (new file, 30 lines)
@@ -0,0 +1,30 @@
package migration

import (
    "database/sql"
    "github.com/pressly/goose"
    "path/filepath"
)

func initialize20210129134326() {
    if _, ok := initializedMigrations[20210129134326]; !ok {
        goose.AddMigration(Up20210129134326, Down20210129134326)
        initializedMigrations[20210129134326] = true
    }
}

// Up20210129134326 updates CVAT to the latest image
func Up20210129134326(tx *sql.Tx) error {
    // This code is executed when the migration is applied.
    return updateWorkspaceTemplateManifest(
        filepath.Join("workspaces", "cvat", "20210129134326.yaml"),
        cvatTemplateName)
}

// Down20210129134326 reverts to the previous CVAT image
func Down20210129134326(tx *sql.Tx) error {
    // This code is executed when the migration is rolled back.
    return updateWorkspaceTemplateManifest(
        filepath.Join("workspaces", "cvat", "20210107094725.yaml"),
        cvatTemplateName)
}
db/go/20210129142057_update_jupyter_workspace.go (new file, 30 lines)
@@ -0,0 +1,30 @@
package migration

import (
    "database/sql"
    "github.com/pressly/goose"
    "path/filepath"
)

func initialize20210129142057() {
    if _, ok := initializedMigrations[20210129142057]; !ok {
        goose.AddMigration(Up20210129142057, Down20210129142057)
        initializedMigrations[20210129142057] = true
    }
}

// Up20210129142057 updates the jupyterlab workspace template
func Up20210129142057(tx *sql.Tx) error {
    // This code is executed when the migration is applied.
    return updateWorkspaceTemplateManifest(
        filepath.Join("workspaces", "jupyterlab", "20210129142057.yaml"),
        jupyterLabTemplateName)
}

// Down20210129142057 rolls back the jupyterlab workspace template update
func Down20210129142057(tx *sql.Tx) error {
    // This code is executed when the migration is rolled back.
    return updateWorkspaceTemplateManifest(
        filepath.Join("workspaces", "jupyterlab", "20201229205644.yaml"),
        jupyterLabTemplateName)
}
db/go/20210129152427_update_vscode_template.go (new file, 33 lines)
@@ -0,0 +1,33 @@
package migration

import (
    "database/sql"
    "github.com/pressly/goose"
    "path/filepath"
)

func initialize20210129152427() {
    if _, ok := initializedMigrations[20210129152427]; !ok {
        goose.AddMigration(Up20210129152427, Down20210129152427)
        initializedMigrations[20210129152427] = true
    }
}

// Up20210129152427 adds lifecycle hooks to the VSCode template.
// These hooks attempt to export the installed conda, pip, and vscode packages to a text file.
// On workspace resume / start, the code then tries to install these packages.
func Up20210129152427(tx *sql.Tx) error {
    // This code is executed when the migration is applied.
    return updateWorkspaceTemplateManifest(
        filepath.Join("workspaces", "vscode", "20210129152427.yaml"),
        vscodeWorkspaceTemplateName)
}

// Down20210129152427 removes the lifecycle hooks from the VSCode workspace template.
func Down20210129152427(tx *sql.Tx) error {
    // This code is executed when the migration is rolled back.
    return updateWorkspaceTemplateManifest(
        filepath.Join("workspaces", "vscode", "20201028145443.yaml"),
        vscodeWorkspaceTemplateName)
}
@@ -86,6 +86,10 @@ func Initialize() {
    initialize20201225172926()
    initialize20201229205644()
    initialize20210107094725()
+   initialize20210118175809()
+   initialize20210129134326()
+   initialize20210129142057()
+   initialize20210129152427()

    if err := client.DB.Close(); err != nil {
        log.Printf("[error] closing db %v", err)
db/yaml/workflows/hyperparameter-tuning/20210118175809.yaml (new file, 189 lines)
@@ -0,0 +1,189 @@
# source: https://github.com/onepanelio/templates/blob/master/workflows/nni-hyperparameter-tuning/mnist/
# Workflow Template example for hyperparameter tuning
# Documentation: https://docs.onepanel.ai/docs/reference/workflows/hyperparameter-tuning
#
# Only change the fields marked with [CHANGE]
entrypoint: main
arguments:
  parameters:

    # [CHANGE] Path to your training/model architecture code repository
    # Change this value and revision value to your code repository and branch respectively
    - name: source
      value: https://github.com/onepanelio/templates

    # [CHANGE] Revision is the branch or tag that you want to use
    # You can change this to any tag or branch name in your repository
    - name: revision
      value: v0.18.0

    # [CHANGE] Default configuration for the NNI tuner
    # See https://docs.onepanel.ai/docs/reference/workflows/hyperparameter-tuning#understanding-the-configurations
    - name: config
      displayName: Configuration
      required: true
      hint: NNI configuration
      type: textarea.textarea
      value: |-
        authorName: Onepanel, Inc.
        experimentName: MNIST TF v2.x
        trialConcurrency: 1
        maxExecDuration: 1h
        maxTrialNum: 10
        trainingServicePlatform: local
        searchSpacePath: search_space.json
        useAnnotation: false
        tuner:
          # gpuIndices: '0' # uncomment and update to the GPU indices to assign this tuner
          builtinTunerName: TPE # choices: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
          classArgs:
            optimize_mode: maximize # choices: maximize, minimize
        trial:
          command: python main.py --output /mnt/output
          codeDir: .
          # gpuNum: 1 # uncomment and update to number of GPUs

    # [CHANGE] Search space configuration
    # Change according to your hyperparameters and ranges
    - name: search-space
      displayName: Search space configuration
      required: true
      type: textarea.textarea
      value: |-
        {
          "dropout_rate": { "_type": "uniform", "_value": [0.5, 0.9] },
          "conv_size": { "_type": "choice", "_value": [2, 3, 5, 7] },
          "hidden_size": { "_type": "choice", "_value": [124, 512, 1024] },
          "batch_size": { "_type": "choice", "_value": [16, 32] },
          "learning_rate": { "_type": "choice", "_value": [0.0001, 0.001, 0.01, 0.1] },
          "epochs": { "_type": "choice", "_value": [10] }
        }

    # Node pool dropdown (Node group in EKS)
    # You can add more of these if you have additional tasks that can run on different node pools
    - displayName: Node pool
      hint: Name of node pool or group to run this workflow task
      type: select.nodepool
      name: sys-node-pool
      value: {{.DefaultNodePoolOption}}
      required: true

templates:
  - name: main
    dag:
      tasks:
        - name: hyperparameter-tuning
          template: hyperparameter-tuning
        - name: metrics-writer
          template: metrics-writer
          dependencies: [hyperparameter-tuning]
          arguments:
            # Use sys-metrics artifact output from hyperparameter-tuning Task
            # This writes the best metrics to the Workflow
            artifacts:
              - name: sys-metrics
                from: "{{tasks.hyperparameter-tuning.outputs.artifacts.sys-metrics}}"
  - name: hyperparameter-tuning
    inputs:
      artifacts:
        - name: src
          # Clone the above repository into '/mnt/data/src'
          # See https://docs.onepanel.ai/docs/reference/workflows/artifacts#git for private repositories
          git:
            repo: '{{workflow.parameters.source}}'
            revision: '{{workflow.parameters.revision}}'
          path: /mnt/data/src
        # [CHANGE] Path where config.yaml will be generated or already exists
        # Update the path below so that config.yaml is written to the same directory as your main.py file
        # Note that your source code is cloned to /mnt/data/src
        - name: config
          path: /mnt/data/src/workflows/hyperparameter-tuning/mnist/config.yaml
          raw:
            data: '{{workflow.parameters.config}}'
        # [CHANGE] Path where search_space.json will be generated or already exists
        # Update the path below so that search_space.json is written to the same directory as your main.py file
        # Note that your source code is cloned to /mnt/data/src
        - name: search-space
          path: /mnt/data/src/workflows/hyperparameter-tuning/mnist/search_space.json
          raw:
            data: '{{workflow.parameters.search-space}}'
    outputs:
      artifacts:
        - name: output
          path: /mnt/output
          optional: true
    container:
      image: onepanel/dl:0.17.0
      args:
        - --config
        # [CHANGE] Update the path below to point to config.yaml path as described above
        - /mnt/data/src/workflows/hyperparameter-tuning/mnist/config.yaml
      workingDir: /mnt
      volumeMounts:
        - name: hyperparamtuning-data
          mountPath: /mnt/data
        - name: hyperparamtuning-output
          mountPath: /mnt/output
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    sidecars:
      - name: nni-web-ui
        image: onepanel/nni-web-ui:0.17.0
        env:
          - name: ONEPANEL_INTERACTIVE_SIDECAR
            value: 'true'
        ports:
          - containerPort: 9000
            name: nni
      - name: tensorboard
        image: onepanel/dl:0.17.0
        command:
          - sh
          - '-c'
        env:
          - name: ONEPANEL_INTERACTIVE_SIDECAR
            value: 'true'
        args:
          # Read logs from /mnt/output/tensorboard - /mnt/output is auto-mounted from volumeMounts
          - tensorboard --logdir /mnt/output/tensorboard
        ports:
          - containerPort: 6006
            name: tensorboard
  # Use the metrics-writer tasks to write best metrics to Workflow
  - name: metrics-writer
    inputs:
      artifacts:
        - name: sys-metrics
          path: /tmp/sys-metrics.json
        - git:
            repo: https://github.com/onepanelio/templates.git
            revision: v0.18.0
          name: src
          path: /mnt/src
    container:
      image: onepanel/python-sdk:v0.16.0
      command:
        - python
        - -u
      args:
        - /mnt/src/tasks/metrics-writer/main.py
        - --from_file=/tmp/sys-metrics.json

# [CHANGE] Volumes that will mount to /mnt/data (annotated data) and /mnt/output (models, checkpoints, logs)
# Update this depending on your annotation data, model, checkpoint, logs, etc. sizes
# Example values: 250Mi, 500Gi, 1Ti
volumeClaimTemplates:
  - metadata:
      name: hyperparamtuning-data
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 20Gi
  - metadata:
      name: hyperparamtuning-output
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 20Gi
db/yaml/workflows/maskrcnn-training/20210118175809.yaml (new file, 208 lines)
@@ -0,0 +1,208 @@
# source: https://github.com/onepanelio/templates/blob/master/workflows/maskrcnn-training/
arguments:
  parameters:
    - name: cvat-annotation-path
      value: 'artifacts/{{workflow.namespace}}/annotations/'
      hint: Path to annotated data (COCO format) in default object storage. In CVAT, this parameter will be pre-populated.
      displayName: Dataset path
      visibility: internal

    - name: val-split
      value: 10
      displayName: Validation split size
      type: input.number
      visibility: public
      hint: Enter validation set size in percentage of full dataset. (0 - 100)

    - name: num-augmentation-cycles
      value: 1
      displayName: Number of augmentation cycles
      type: input.number
      visibility: public
      hint: Number of augmentation cycles, zero means no data augmentation

    - name: preprocessing-parameters
      value: |-
        RandomBrightnessContrast:
          p: 0.2
        GaussianBlur:
          p: 0.3
        GaussNoise:
          p: 0.4
        HorizontalFlip:
          p: 0.5
        VerticalFlip:
          p: 0.3
      displayName: Preprocessing parameters
      visibility: public
      type: textarea.textarea
      hint: 'See <a href="https://albumentations.ai/docs/api_reference/augmentations/transforms/" target="_blank">documentation</a> for more information on parameters.'

    - name: cvat-num-classes
      displayName: Number of classes
      hint: Number of classes. In CVAT, this parameter will be pre-populated.
      value: '10'
      visibility: internal

    - name: hyperparameters
      displayName: Hyperparameters
      visibility: public
      type: textarea.textarea
      value: |-
        stage-1-epochs: 1 # Epochs for network heads
        stage-2-epochs: 1 # Epochs for finetune layers
        stage-3-epochs: 1 # Epochs for all layers
        num_steps: 1000 # Num steps per epoch
      hint: "See <a href='https://docs.onepanel.ai/docs/reference/cvat/built-in-models#maskrcnn-hyperparameters' target='_blank'>documentation</a> for more information on parameters."

    - name: dump-format
      value: cvat_coco
      displayName: CVAT dump format
      visibility: private

    - name: cvat-finetune-checkpoint
      value: ''
      hint: Path to the last fine-tune checkpoint for this model in default object storage. Leave empty if this is the first time you're training this model.
      displayName: Checkpoint path
      visibility: public

    - displayName: Node pool
      hint: Name of node pool or group to run this workflow task
      type: select.nodepool
      visibility: public
      name: sys-node-pool
      value: {{.DefaultNodePoolOption}}
      required: true

entrypoint: main
templates:
  - dag:
      tasks:
        - name: preprocessing
          template: preprocessing
        - name: train-model
          template: tensorflow
          dependencies: [preprocessing]
          arguments:
            artifacts:
              - name: data
                from: "{{tasks.preprocessing.outputs.artifacts.processed-data}}"
    name: main
  - container:
      args:
        - |
          pip install pycocotools scikit-image==0.16.2 && \
          cd /mnt/src/train/workflows/maskrcnn-training && \
          python -u main.py train --dataset=/mnt/data/datasets/train_set/ \
            --model=workflow_maskrcnn \
            --extras="{{workflow.parameters.hyperparameters}}" \
            --ref_model_path="{{workflow.parameters.cvat-finetune-checkpoint}}" \
            --num_classes="{{workflow.parameters.cvat-num-classes}}" \
            --val_dataset=/mnt/data/datasets/eval_set/ \
            --use_validation=True
      command:
        - sh
        - -c
      image: onepanel/dl:0.17.0
      volumeMounts:
        - mountPath: /mnt/data
          name: processed-data
        - mountPath: /mnt/output
          name: output
      workingDir: /mnt/src
    sidecars:
      - name: tensorboard
        image: onepanel/dl:0.17.0
        command: [ sh, -c ]
        env:
          - name: ONEPANEL_INTERACTIVE_SIDECAR
            value: 'true'
        args: [ "tensorboard --logdir /mnt/output/tensorboard" ]
        ports:
          - containerPort: 6006
            name: tensorboard
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    inputs:
      artifacts:
        - name: data
          path: /mnt/data/datasets/
        - name: models
          path: /mnt/data/models/
          optional: true
          s3:
            key: '{{workflow.parameters.cvat-finetune-checkpoint}}'
        - git:
            repo: https://github.com/onepanelio/templates.git
            revision: v0.18.0
          name: src
          path: /mnt/src/train
    name: tensorflow
    outputs:
      artifacts:
        - name: model
          optional: true
          path: /mnt/output
  - container:
      args:
        - |
          pip install pycocotools && \
          cd /mnt/src/preprocessing/workflows/albumentations-preprocessing && \
          python -u main.py \
            --data_aug_params="{{workflow.parameters.preprocessing-parameters}}" \
            --val_split={{workflow.parameters.val-split}} \
            --aug_steps={{workflow.parameters.num-augmentation-cycles}}
      command:
        - sh
        - -c
      image: onepanel/dl:0.17.0
      volumeMounts:
        - mountPath: /mnt/data
          name: data
        - mountPath: /mnt/output
          name: processed-data
      workingDir: /mnt/src
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    inputs:
      artifacts:
        - name: data
          path: /mnt/data/datasets/
          s3:
            key: '{{workflow.parameters.cvat-annotation-path}}'
        - git:
            repo: https://github.com/onepanelio/templates.git
            revision: v0.18.0
          name: src
          path: /mnt/src/preprocessing
    name: preprocessing
    outputs:
      artifacts:
        - name: processed-data
          optional: true
          path: /mnt/output
volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 200Gi
  - metadata:
      name: processed-data
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 200Gi
  - metadata:
      name: output
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 200Gi
db/yaml/workflows/pytorch-mnist-training/20210118175809.yaml (new file, 207 lines)
@@ -0,0 +1,207 @@
# source: https://github.com/onepanelio/templates/blob/master/workflows/pytorch-mnist-training/
arguments:
  parameters:
    - name: epochs
      value: '10'
    - displayName: Node pool
      hint: Name of node pool or group to run this workflow task
      type: select.nodepool
      name: sys-node-pool
      value: {{.DefaultNodePoolOption}}
      visibility: public
      required: true
entrypoint: main
templates:
  - name: main
    dag:
      tasks:
        - name: train-model
          template: train-model
  - name: train-model
    # Indicates that we want to push files in /mnt/output to object storage
    outputs:
      artifacts:
        - name: output
          path: /mnt/output
          optional: true
    script:
      image: onepanel/dl:0.17.0
      command:
        - python
        - '-u'
      source: |
        import json
        import torch
        import torch.nn as nn
        import torch.nn.functional as F
        import torch.optim as optim
        from torchvision import datasets, transforms
        from torch.optim.lr_scheduler import StepLR
        from torch.utils.tensorboard import SummaryWriter


        class Net(nn.Module):
            def __init__(self):
                super(Net, self).__init__()
                self.conv1 = nn.Conv2d(1, 32, 3, 1)
                self.conv2 = nn.Conv2d(32, 64, 3, 1)
                self.dropout1 = nn.Dropout(0.25)
                self.dropout2 = nn.Dropout(0.5)
                self.fc1 = nn.Linear(9216, 128)
                self.fc2 = nn.Linear(128, 10)

            def forward(self, x):
                x = self.conv1(x)
                x = F.relu(x)
                x = self.conv2(x)
                x = F.relu(x)
                x = F.max_pool2d(x, 2)
                x = self.dropout1(x)
                x = torch.flatten(x, 1)
                x = self.fc1(x)
                x = F.relu(x)
                x = self.dropout2(x)
                x = self.fc2(x)
                output = F.log_softmax(x, dim=1)
                return output


        def train(model, device, train_loader, optimizer, epoch, batch_size, writer):
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                if batch_idx % 10 == 0:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(data), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), loss.item()))

            writer.add_scalar('training loss', loss.item(), epoch)


        def test(model, device, test_loader, epoch, writer):
            model.eval()
            test_loss = 0
            correct = 0
            with torch.no_grad():
                for data, target in test_loader:
                    data, target = data.to(device), target.to(device)
                    output = model(data)
                    test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
                    pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                    correct += pred.eq(target.view_as(pred)).sum().item()

            loss = test_loss / len(test_loader.dataset)
            accuracy = correct / len(test_loader.dataset)

            print('\nTest set: Average loss: {}, Accuracy: {}\n'.format(
                loss, accuracy))

            # Store metrics for this task
            metrics = [
                {'name': 'accuracy', 'value': accuracy},
                {'name': 'loss', 'value': loss}
            ]
            with open('/tmp/sys-metrics.json', 'w') as f:
                json.dump(metrics, f)


        def main(params):
            writer = SummaryWriter(log_dir='/mnt/output/tensorboard')

            use_cuda = torch.cuda.is_available()

            torch.manual_seed(params['seed'])

            device = torch.device('cuda' if use_cuda else 'cpu')

            train_kwargs = {'batch_size': params['batch_size']}
            test_kwargs = {'batch_size': params['test_batch_size']}
            if use_cuda:
                cuda_kwargs = {'num_workers': 1,
                               'pin_memory': True,
                               'shuffle': True}
                train_kwargs.update(cuda_kwargs)
                test_kwargs.update(cuda_kwargs)

            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ])
            dataset1 = datasets.MNIST('/mnt/data', train=True, download=True,
                                      transform=transform)
            dataset2 = datasets.MNIST('/mnt/data', train=False,
                                      transform=transform)
            train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
            test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

            model = Net().to(device)
            optimizer = optim.Adadelta(model.parameters(), lr=params['lr'])

            scheduler = StepLR(optimizer, step_size=1, gamma=params['gamma'])
            for epoch in range(1, params['epochs'] + 1):
                train(model, device, train_loader, optimizer, epoch, params['batch_size'], writer)
                test(model, device, test_loader, epoch, writer)
                scheduler.step()

            # Save model
            torch.save(model.state_dict(), '/mnt/output/model.pt')

            writer.close()


        if __name__ == '__main__':
            params = {
                'seed': 1,
                'batch_size': 64,
                'test_batch_size': 1000,
                'epochs': {{workflow.parameters.epochs}},
                'lr': 0.001,
                'gamma': 0.7,
            }
            main(params)
      volumeMounts:
        # TensorBoard sidecar will automatically mount these volumes
        # The `data` volume is mounted for saving datasets
        # The `output` volume is mounted to save model output and share TensorBoard logs
        - name: data
          mountPath: /mnt/data
        - name: output
          mountPath: /mnt/output
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    sidecars:
      - name: tensorboard
        image: onepanel/dl:0.17.0
        command:
          - sh
          - '-c'
        env:
          - name: ONEPANEL_INTERACTIVE_SIDECAR
            value: 'true'
        args:
          # Read logs from /mnt/output - this directory is auto-mounted from volumeMounts
          - tensorboard --logdir /mnt/output/tensorboard
        ports:
          - containerPort: 6006
            name: tensorboard
volumeClaimTemplates:
  # Provision volumes for storing data and output
  - metadata:
      name: data
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 2Gi
  - metadata:
      name: output
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 2Gi
db/yaml/workflows/tensorflow-mnist-training/20210118175809.yaml (new file, 118 lines)
@@ -0,0 +1,118 @@
# source: https://github.com/onepanelio/templates/blob/master/workflows/tensorflow-mnist-training/
arguments:
  parameters:
    - name: epochs
      value: '10'
    - displayName: Node pool
      hint: Name of node pool or group to run this workflow task
      type: select.nodepool
      name: sys-node-pool
      value: {{.DefaultNodePoolOption}}
      visibility: public
      required: true
entrypoint: main
templates:
  - name: main
    dag:
      tasks:
        - name: train-model
          template: train-model
  - name: train-model
    # Indicates that we want to push files in /mnt/output to object storage
    outputs:
      artifacts:
        - name: output
          path: /mnt/output
          optional: true
    script:
      image: onepanel/dl:0.17.0
      command:
        - python
        - '-u'
      source: |
        import json
        import tensorflow as tf

        mnist = tf.keras.datasets.mnist

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0
        x_train = x_train[..., tf.newaxis]
        x_test = x_test[..., tf.newaxis]

        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(filters=32, kernel_size=5, activation='relu'),
            tf.keras.layers.MaxPool2D(pool_size=2),
            tf.keras.layers.Conv2D(filters=64, kernel_size=5, activation='relu'),
            tf.keras.layers.MaxPool2D(pool_size=2),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(units=124, activation='relu'),
            tf.keras.layers.Dropout(rate=0.75),
            tf.keras.layers.Dense(units=10, activation='softmax')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        # Write TensorBoard logs to /mnt/output
        log_dir = '/mnt/output/tensorboard/'
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

        model.fit(x=x_train,
                  y=y_train,
                  epochs={{workflow.parameters.epochs}},
                  validation_data=(x_test, y_test),
                  callbacks=[tensorboard_callback])

        # Store metrics for this task
        loss, accuracy = model.evaluate(x_test, y_test)
        metrics = [
            {'name': 'accuracy', 'value': accuracy},
            {'name': 'loss', 'value': loss}
        ]
        with open('/tmp/sys-metrics.json', 'w') as f:
            json.dump(metrics, f)

        # Save model
        model.save('/mnt/output/model.h5')
      volumeMounts:
        # TensorBoard sidecar will automatically mount these volumes
        # The `data` volume is mounted to support Keras datasets
        # The `output` volume is mounted to save model output and share TensorBoard logs
        - name: data
          mountPath: /home/root/.keras/datasets
        - name: output
          mountPath: /mnt/output
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    sidecars:
      - name: tensorboard
        image: onepanel/dl:0.17.0
        command:
          - sh
          - '-c'
        env:
          - name: ONEPANEL_INTERACTIVE_SIDECAR
            value: 'true'
        args:
          # Read logs from /mnt/output - this directory is auto-mounted from volumeMounts
          - tensorboard --logdir /mnt/output/tensorboard
        ports:
          - containerPort: 6006
            name: tensorboard
volumeClaimTemplates:
  # Provision volumes for storing data and output
  - metadata:
      name: data
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 2Gi
  - metadata:
      name: output
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 2Gi
db/yaml/workflows/tf-object-detection-training/20210118175809.yaml (new file, 260 lines)
@@ -0,0 +1,260 @@
# source: https://github.com/onepanelio/templates/blob/master/workflows/tf-object-detection-training/
arguments:
  parameters:
    - name: cvat-annotation-path
      value: 'artifacts/{{workflow.namespace}}/annotations/'
      hint: Path to annotated data (COCO format) in default object storage. In CVAT, this parameter will be pre-populated.
      displayName: Dataset path
      visibility: internal

    - name: val-split
      value: 10
      displayName: Validation split size
      type: input.number
      visibility: public
      hint: Enter validation set size in percentage of full dataset. (0 - 100)

    - name: num-augmentation-cycles
      value: 1
      displayName: Number of augmentation cycles
      type: input.number
      visibility: public
      hint: Number of augmentation cycles, zero means no data augmentation

    - name: preprocessing-parameters
      value: |-
        RandomBrightnessContrast:
          p: 0.2
        GaussianBlur:
          p: 0.3
        GaussNoise:
          p: 0.4
        HorizontalFlip:
          p: 0.5
        VerticalFlip:
          p: 0.3
      displayName: Preprocessing parameters
      visibility: public
      type: textarea.textarea
      hint: 'See <a href="https://albumentations.ai/docs/api_reference/augmentations/transforms/" target="_blank">documentation</a> for more information on parameters.'

    - name: cvat-model
      value: frcnn-res50-coco
      displayName: Model
      hint: TF Detection API's model to use for training.
      type: select.select
      visibility: public
      options:
        - name: 'Faster RCNN-ResNet 101-COCO'
          value: frcnn-res101-coco
        - name: 'Faster RCNN-ResNet 101-Low Proposal-COCO'
          value: frcnn-res101-low
        - name: 'Faster RCNN-ResNet 50-COCO'
          value: frcnn-res50-coco
        - name: 'Faster RCNN-NAS-COCO'
          value: frcnn-nas-coco
        - name: 'SSD MobileNet V1-COCO'
          value: ssd-mobilenet-v1-coco2
        - name: 'SSD MobileNet V2-COCO'
          value: ssd-mobilenet-v2-coco
        - name: 'SSDLite MobileNet-COCO'
          value: ssdlite-mobilenet-coco

    - name: cvat-num-classes
      value: '10'
      hint: Number of classes. In CVAT, this parameter will be pre-populated.
      displayName: Number of classes
      visibility: internal

    - name: hyperparameters
      value: |-
        num_steps: 10000
      displayName: Hyperparameters
      visibility: public
      type: textarea.textarea
      hint: 'See <a href="https://docs.onepanel.ai/docs/reference/cvat/built-in-models#tfod-hyperparameters" target="_blank">documentation</a> for more information on parameters.'

    - name: dump-format
      value: cvat_coco
      displayName: CVAT dump format
      visibility: private

    - name: cvat-finetune-checkpoint
      value: ''
      hint: Path to the last fine-tune checkpoint for this model in default object storage. Leave empty if this is the first time you're training this model.
      displayName: Checkpoint path
      visibility: public

    - name: tf-image
      value: tensorflow/tensorflow:1.13.1-py3
      type: select.select
      displayName: Select TensorFlow image
      visibility: public
      hint: Select the GPU image if you are running on a GPU node pool
      options:
        - name: 'TensorFlow 1.13.1 CPU Image'
          value: 'tensorflow/tensorflow:1.13.1-py3'
        - name: 'TensorFlow 1.13.1 GPU Image'
          value: 'tensorflow/tensorflow:1.13.1-gpu-py3'

    - displayName: Node pool
      hint: Name of node pool or group to run this workflow task
      type: select.nodepool
      name: sys-node-pool
      value: {{.DefaultNodePoolOption}}
      visibility: public
      required: true

entrypoint: main
templates:
  - dag:
      tasks:
        - name: preprocessing
          template: preprocessing
        - name: train-model
          template: tensorflow
          dependencies: [preprocessing]
          arguments:
            artifacts:
              - name: data
                from: "{{tasks.preprocessing.outputs.artifacts.processed-data}}"
    name: main
  - container:
      args:
        - |
          apt-get update && \
          apt-get install -y python3-pip git wget unzip libglib2.0-0 libsm6 libxext6 libxrender-dev && \
          pip install --upgrade pip && \
          pip install pillow lxml Cython contextlib2 matplotlib numpy scipy pycocotools pyyaml test-generator && \
          cd /mnt/src/tf/research && \
          export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim && \
          mkdir -p /mnt/src/protoc && \
          wget -P /mnt/src/protoc https://github.com/protocolbuffers/protobuf/releases/download/v3.10.1/protoc-3.10.1-linux-x86_64.zip && \
          cd /mnt/src/protoc/ && \
          unzip protoc-3.10.1-linux-x86_64.zip && \
          cd /mnt/src/tf/research/ && \
          /mnt/src/protoc/bin/protoc object_detection/protos/*.proto --python_out=. && \
          cd /mnt/src/train/workflows/tf-object-detection-training && \
          python main.py \
            --extras="{{workflow.parameters.hyperparameters}}" \
            --model="{{workflow.parameters.cvat-model}}" \
            --num_classes="{{workflow.parameters.cvat-num-classes}}" \
            --sys_finetune_checkpoint="{{workflow.parameters.cvat-finetune-checkpoint}}" \
            --from_preprocessing=True
      command:
        - sh
        - -c
      image: '{{workflow.parameters.tf-image}}'
      volumeMounts:
        - mountPath: /mnt/data
          name: processed-data
        - mountPath: /mnt/output
          name: output
      workingDir: /mnt/src
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    inputs:
      artifacts:
        - name: data
          path: /mnt/data/datasets/
        - name: models
          path: /mnt/data/models/
          optional: true
          s3:
            key: '{{workflow.parameters.cvat-finetune-checkpoint}}'
        - git:
            repo: https://github.com/tensorflow/models.git
            revision: v1.13.0
          name: src
          path: /mnt/src/tf
        - git:
            repo: https://github.com/onepanelio/templates.git
            revision: v0.18.0
          name: tsrc
          path: /mnt/src/train
    name: tensorflow
    outputs:
      artifacts:
        - name: model
          optional: true
          path: /mnt/output
    sidecars:
      - name: tensorboard
        image: '{{workflow.parameters.tf-image}}'
        command:
          - sh
          - '-c'
        env:
          - name: ONEPANEL_INTERACTIVE_SIDECAR
            value: 'true'
        args:
          # Read logs from /mnt/output - this directory is auto-mounted from volumeMounts
          - tensorboard --logdir /mnt/output/checkpoints/
        ports:
          - containerPort: 6006
            name: tensorboard
  - container:
      args:
        - |
          pip install --upgrade pip && \
          pip install opencv-python albumentations tqdm pyyaml pycocotools && \
          cd /mnt/src/preprocessing/workflows/albumentations-preprocessing && \
          python -u main.py \
            --data_aug_params="{{workflow.parameters.preprocessing-parameters}}" \
            --format="tfrecord" \
            --val_split={{workflow.parameters.val-split}} \
            --aug_steps={{workflow.parameters.num-augmentation-cycles}}
      command:
        - sh
        - -c
      image: '{{workflow.parameters.tf-image}}'
      volumeMounts:
        - mountPath: /mnt/data
          name: data
        - mountPath: /mnt/output
          name: processed-data
      workingDir: /mnt/src
    nodeSelector:
      {{.NodePoolLabel}}: '{{workflow.parameters.sys-node-pool}}'
    inputs:
      artifacts:
        - name: data
          path: /mnt/data/datasets/
          s3:
            key: '{{workflow.parameters.cvat-annotation-path}}'
        - git:
            repo: https://github.com/onepanelio/templates.git
            revision: v0.18.0
          name: src
          path: /mnt/src/preprocessing
    name: preprocessing
    outputs:
      artifacts:
        - name: processed-data
          optional: true
          path: /mnt/output
volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 200Gi
  - metadata:
      name: processed-data
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 200Gi
  - metadata:
      name: output
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 200Gi
db/yaml/workspaces/cvat/20210129134326.yaml (new file, 134 lines)
@@ -0,0 +1,134 @@
containers:
  - name: cvat-db
    image: postgres:10-alpine
    env:
      - name: POSTGRES_USER
        value: root
      - name: POSTGRES_DB
        value: cvat
      - name: POSTGRES_HOST_AUTH_METHOD
        value: trust
      - name: PGDATA
        value: /var/lib/psql/data
    ports:
      - containerPort: 5432
        name: tcp
    volumeMounts:
      - name: db
        mountPath: /var/lib/psql
  - name: cvat-redis
    image: redis:4.0-alpine
    ports:
      - containerPort: 6379
        name: tcp
  - name: cvat
    image: onepanel/cvat:0.18.0_cvat.1.0.0
    env:
      - name: DJANGO_MODWSGI_EXTRA_ARGS
        value: ""
      - name: ALLOWED_HOSTS
        value: '*'
      - name: CVAT_REDIS_HOST
        value: localhost
      - name: CVAT_POSTGRES_HOST
        value: localhost
      - name: CVAT_SHARE_URL
        value: /cvat/data
      - name: CVAT_SHARE_DIR
        value: /share
      - name: CVAT_DATA_DIR
        value: /cvat/data
      - name: CVAT_MEDIA_DATA_DIR
        value: /cvat/data/data
      - name: CVAT_KEYS_DIR
        value: /cvat/data/keys
      - name: CVAT_MODELS_DIR
        value: /cvat/data/models
      - name: CVAT_LOGS_DIR
        value: /cvat/logs
      - name: CVAT_ANNOTATIONS_OBJECT_STORAGE_PREFIX
        value: 'artifacts/$(ONEPANEL_RESOURCE_NAMESPACE)/annotations/'
      - name: CVAT_ONEPANEL_WORKFLOWS_LABEL
        value: 'key=used-by,value=cvat'
      - name: NVIDIA_VISIBLE_DEVICES
        value: all
      - name: NVIDIA_DRIVER_CAPABILITIES
        value: compute,utility
      - name: NVIDIA_REQUIRE_CUDA
        value: "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411"
      - name: ONEPANEL_MAIN_CONTAINER
        value: 'true'
    ports:
      - containerPort: 8080
        name: http
    volumeMounts:
      - name: cvat-data
        mountPath: /cvat
      - name: share
        mountPath: /share
      - name: sys-namespace-config
        mountPath: /etc/onepanel
        readOnly: true
  - name: cvat-ui
    image: onepanel/cvat-ui:0.18.0_cvat.1.0.0
    ports:
      - containerPort: 80
        name: http
  - name: sys-filesyncer
    image: onepanel/filesyncer:0.18.0
    imagePullPolicy: Always
    args:
      - server
      - -server-prefix=/sys/filesyncer
    volumeMounts:
      - name: share
        mountPath: /share
      - name: sys-namespace-config
        mountPath: /etc/onepanel
        readOnly: true
ports:
  - name: cvat-ui
    port: 80
    protocol: TCP
    targetPort: 80
  - name: cvat
    port: 8080
    protocol: TCP
    targetPort: 8080
  - name: fs
    port: 8888
    protocol: TCP
    targetPort: 8888
routes:
  - match:
      - uri:
          prefix: /sys/filesyncer
    route:
      - destination:
          port:
            number: 8888
  - match:
      - uri:
          regex: /api/.*|/git/.*|/tensorflow/.*|/onepanelio/.*|/tracking/.*|/auto_annotation/.*|/analytics/.*|/static/.*|/admin/.*|/documentation/.*|/dextr/.*|/reid/.*
      - queryParams:
          id:
            regex: \d+.*
    route:
      - destination:
          port:
            number: 8080
  - match:
      - uri:
          prefix: /
    route:
      - destination:
          port:
            number: 80
volumeClaimTemplates:
  - metadata:
      name: db
    spec:
      accessModes: ["ReadWriteOnce"]
      resources:
        requests:
          storage: 20Gi
db/yaml/workspaces/jupyterlab/20210129142057.yaml (new file, 99 lines)
@@ -0,0 +1,99 @@
containers:
  - name: jupyterlab
    image: onepanel/dl:0.17.0
    command: ["/bin/bash", "-c", "pip install onepanel-sdk && start.sh LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 jupyter lab --LabApp.token='' --LabApp.allow_remote_access=True --LabApp.allow_origin=\"*\" --LabApp.disable_check_xsrf=True --LabApp.trust_xheaders=True --LabApp.base_url=/ --LabApp.tornado_settings='{\"headers\":{\"Content-Security-Policy\":\"frame-ancestors * 'self'\"}}' --notebook-dir='/data' --allow-root"]
    workingDir: /data
    env:
      - name: tornado
        value: "'{'headers':{'Content-Security-Policy':\"frame-ancestors\ *\ 'self'\"}}'"
      - name: TENSORBOARD_PROXY_URL
        value: '//$(ONEPANEL_RESOURCE_UID)--$(ONEPANEL_RESOURCE_NAMESPACE).$(ONEPANEL_DOMAIN)/tensorboard'
    ports:
      - containerPort: 8888
        name: jupyterlab
      - containerPort: 6006
        name: tensorboard
      - containerPort: 8080
        name: nni
    volumeMounts:
      - name: data
        mountPath: /data
    lifecycle:
      postStart:
        exec:
          command:
            - /bin/sh
            - -c
            - >
              condayml="/data/.environment.yml";
              jupytertxt="/data/.jupexported.txt";
              if [ -f "$condayml" ]; then conda env update -f $condayml; fi;
              if [ -f "$jupytertxt" ]; then cat $jupytertxt | xargs -n 1 jupyter labextension install --no-build && jupyter lab build --minimize=False; fi;
      preStop:
        exec:
          command:
            - /bin/sh
            - -c
            - >
              conda env export > /data/.environment.yml -n base;
              jupyter labextension list 1>/dev/null 2> /data/.jup.txt;
              cat /data/.jup.txt | sed -n '2,$p' | awk 'sub(/v/,"@", $2){print $1$2}' > /data/.jupexported.txt;
  - name: sys-filesyncer
    image: onepanel/filesyncer:0.18.0
    imagePullPolicy: Always
    args:
      - server
      - -host=localhost:8889
      - -server-prefix=/sys/filesyncer
    volumeMounts:
      - name: data
        mountPath: /data
      - name: sys-namespace-config
        mountPath: /etc/onepanel
        readOnly: true
ports:
  - name: jupyterlab
    port: 80
    protocol: TCP
    targetPort: 8888
  - name: tensorboard
    port: 6006
    protocol: TCP
    targetPort: 6006
  - name: nni
    port: 8080
    protocol: TCP
    targetPort: 8080
  - name: fs
    port: 8889
    protocol: TCP
    targetPort: 8889
routes:
  - match:
      - uri:
          prefix: /sys/filesyncer
    route:
      - destination:
          port:
            number: 8889
  - match:
      - uri:
          prefix: /tensorboard
    route:
      - destination:
          port:
            number: 6006
  - match:
      - uri:
          prefix: /nni
    route:
      - destination:
          port:
            number: 8080
  - match:
      - uri:
          prefix: /
    route:
      - destination:
          port:
            number: 80
db/yaml/workspaces/vscode/20210129152427.yaml (new file, 65 lines)
@@ -0,0 +1,65 @@
containers:
  - name: vscode
    image: onepanel/vscode:1.0.0
    command: ["/bin/bash", "-c", "pip install onepanel-sdk && /usr/bin/entrypoint.sh --bind-addr 0.0.0.0:8080 --auth none ."]
    ports:
      - containerPort: 8080
        name: vscode
    volumeMounts:
      - name: data
        mountPath: /data
    lifecycle:
      postStart:
        exec:
          command:
            - /bin/sh
            - -c
            - >
              condayml="/data/.environment.yml";
              vscodetxt="/data/.vscode-extensions.txt";
              if [ -f "$condayml" ]; then conda env update -f $condayml; fi;
              if [ -f "$vscodetxt" ]; then cat $vscodetxt | xargs -n 1 code-server --install-extension; fi;
      preStop:
        exec:
          command:
            - /bin/sh
            - -c
            - >
              conda env export > /data/.environment.yml -n base;
              code-server --list-extensions | tail -n +2 > /data/.vscode-extensions.txt;
  - name: sys-filesyncer
    image: onepanel/filesyncer:0.18.0
    imagePullPolicy: Always
    args:
      - server
      - -server-prefix=/sys/filesyncer
    volumeMounts:
      - name: data
        mountPath: /data
      - name: sys-namespace-config
        mountPath: /etc/onepanel
        readOnly: true
ports:
  - name: vscode
    port: 8080
    protocol: TCP
    targetPort: 8080
  - name: fs
    port: 8888
    protocol: TCP
    targetPort: 8888
routes:
  - match:
      - uri:
          prefix: /sys/filesyncer
    route:
      - destination:
          port:
            number: 8888
  - match:
      - uri:
          prefix: /
    route:
      - destination:
          port:
            number: 8080