Convert Manual QA Projects into Workflow Projects

The following script converts a Manual QA Project into a Workflow Project. When the script runs, it creates a .txt log file listing the data hash of every label row that could not be copied.

❗️

CRITICAL INFORMATION

You must create a Workflow template in the Encord platform to use this script.

ℹ️

Note

  • Project collaborators are not copied. Collaborators must be added to the target Workflow Project manually after it is created using this script.
  • Tasks in the Complete stage of the source Project are copied into the Complete stage of the Workflow Project. Source Project tasks in any stage other than Complete are copied into the Annotate stage of the Workflow Project.

In the main function of the following script, ensure that you:

  • Replace <private_key_path> with the path to the file containing your private key.
  • Replace <source_project_hash> with your source Project's hash.
  • Replace <target_project_name> with the name you want to give to your target Workflow Project.
  • Replace <workflow_template_hash> with the ID of the Workflow template you want to use for the target Project.

Optionally you can:

  • Adjust the Bundle size. The Bundle size in the following script is set to 50; the maximum bundle size is 100.
  • Include the hashes of all objects and classifications you want to copy to the target Project. The default None copies all objects and classifications from the source Project.
# Import dependencies
import argparse
import time
import logging
from typing import List, Optional
from encord import EncordUserClient, Project
from tqdm import tqdm

# Create target workflow Project
def create_target_project(
    user_client: EncordUserClient,
    source_project_hash: str,
    target_project_name: str,
    workflow_template_hash: str,
) -> Project:
    """Create a Workflow Project mirroring the source Project's datasets and ontology.

    Args:
        user_client: Authenticated client used to read the source Project and
            create the target Project.
        source_project_hash: Hash of the Project whose datasets and ontology
            are reused.
        target_project_name: Title given to the newly created Project.
        workflow_template_hash: Hash of the Workflow template applied to the
            new Project.

    Returns:
        The newly created target Project, fetched through ``user_client``.
    """
    source_project = user_client.get_project(source_project_hash)

    # Collect the hashes through a set so each dataset is attached only once.
    dataset_hashes = list(
        {dataset["dataset_hash"] for dataset in source_project.datasets}
    )

    new_project_hash: str = user_client.create_project(
        target_project_name,
        dataset_hashes,
        ontology_hash=source_project.ontology_hash,
        workflow_template_hash=workflow_template_hash,
    )
    return user_client.get_project(new_project_hash)


# Migrate Projects using bundles to facilitate the migration of large Projects
def process_by_bundle(
    source_project: Project,
    target_project: Project,
    bundle_size: int = 100,
    feature_hashes_to_include: Optional[List[str]] = None,
) -> List[str]:
    """Copy labels from ``source_project`` to ``target_project`` in batches.

    The source label rows are split into batches of ``bundle_size``. For each
    batch the source labels are initialised, the matching target label rows
    are looked up by data hash and initialised, and every object and
    classification instance is copied across and saved. Source rows whose
    status is ``"COMPLETED"`` are also marked complete in the target workflow.

    Args:
        source_project: Project the labels are read from.
        target_project: Project the labels are written to.
        bundle_size: Number of label rows handled per bundle. Must be >= 1.
        feature_hashes_to_include: Optional object feature hashes used to
            filter which source objects are loaded. ``None`` (or an empty
            list) loads everything.

    Returns:
        The data hashes of source label rows that could not be copied because
        the target Project had zero or multiple label rows for that data hash.

    Raises:
        ValueError: If ``bundle_size`` is smaller than 1.
    """
    if bundle_size < 1:
        # A negative step would silently yield no batches and copy nothing.
        raise ValueError(f"bundle_size must be >= 1, got {bundle_size}")

    new_problematic_label_rows = []
    all_source_label_rows = source_project.list_label_rows_v2()

    all_batches = [
        all_source_label_rows[i : i + bundle_size]
        for i in range(0, len(all_source_label_rows), bundle_size)
    ]
    for source_rows_batch in tqdm(all_batches):
        # Step 1: load the source labels for the whole batch in one bundle.
        source_bundle = source_project.create_bundle()
        for label_row_s in source_rows_batch:
            if feature_hashes_to_include:
                label_row_s.initialise_labels(
                    include_object_feature_hashes=set(feature_hashes_to_include),
                    bundle=source_bundle,
                )
            else:
                label_row_s.initialise_labels(
                    bundle=source_bundle,
                )

        source_bundle.execute()

        # Step 2: find and initialise the matching target label rows.
        target_label_rows = {}
        target_bundle = target_project.create_bundle()
        for label_row_s in source_rows_batch:
            matches = target_project.list_label_rows_v2(
                data_hashes=[label_row_s.data_hash]
            )
            if len(matches) != 1:
                # WARNING rather than INFO so the message is visible with the
                # default logging configuration.
                logging.warning(
                    f"Something went wrong, zero or multiple matches found {matches}"
                )
                # Track problematic label rows
                new_problematic_label_rows.append(label_row_s.data_hash)
                continue
            label_row_t = matches[0]
            label_row_t.initialise_labels(bundle=target_bundle, overwrite=True)
            target_label_rows[label_row_s.data_hash] = label_row_t
        target_bundle.execute()

        # Step 3: copy the labels over and save them in a fresh bundle.
        target_bundle = target_project.create_bundle()

        for label_row_s in source_rows_batch:
            label_row_t = target_label_rows.get(label_row_s.data_hash)
            if label_row_t is None:
                # Already recorded as problematic in step 2; skipping it here
                # keeps one bad row from aborting the whole migration.
                continue

            if label_row_s.annotation_task_status == "COMPLETED":
                label_row_t.workflow_complete()

            for obj in label_row_s.get_object_instances():
                label_row_t.add_object_instance(obj.copy())

            for cl in label_row_s.get_classification_instances():
                label_row_t.add_classification_instance(cl.copy())
            label_row_t.save(bundle=target_bundle)
        target_bundle.execute()

    return new_problematic_label_rows

# Define the main function
def main(
    keyfile: str,
    source_project_hash: str,
    target_project_name: str,
    workflow_template_hash: str,
    bundle_size: int = 50,
    feature_hashes_to_include: Optional[List[str]] = None,
):
    """Create the target Workflow Project, copy the labels, and write the log.

    Args:
        keyfile: Path to the SSH private key file used to authenticate.
        source_project_hash: Hash of the Project to copy from.
        target_project_name: Title of the Workflow Project to create.
        workflow_template_hash: Hash of the Workflow template to use.
        bundle_size: Number of label rows processed per bundle.
        feature_hashes_to_include: Optional feature hashes forwarded to
            ``process_by_bundle``; ``None`` copies everything.

    Side effects:
        Writes ``problematic_label_rows_log_<name>_<timestamp>.txt`` to the
        current working directory, one data hash per line, and prints the URL
        of the new Project.
    """
    user_client = EncordUserClient.create_with_ssh_private_key(
        ssh_private_key_path=keyfile
    )
    target_project = create_target_project(
        user_client, source_project_hash, target_project_name, workflow_template_hash
    )
    source_project = user_client.get_project(source_project_hash)

    # Process label rows by bundle
    problematic_label_rows = process_by_bundle(
        source_project, target_project, bundle_size, feature_hashes_to_include
    )

    # Write problematic label rows to a log file
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    # Replace spaces plus characters that are path separators or invalid in
    # filenames, so a name such as "A/B test" cannot make open() fail after
    # the migration has already run.
    unsafe_chars = set(' /\\:*?"<>|')
    sanitized_project_name = "".join(
        "_" if ch in unsafe_chars or not ch.isprintable() else ch
        for ch in target_project_name
    )
    log_filename = (
        f"problematic_label_rows_log_{sanitized_project_name}_{timestamp}.txt"
    )
    with open(log_filename, "w", encoding="utf-8") as log_file:
        log_file.writelines(f"{data_hash}\n" for data_hash in problematic_label_rows)
    print("Done!")
    print(
        f"Access project on https://app.encord.com/projects/view/{target_project.project_hash}/summary"
    )

# Run the main function. Insert your parameters here
if __name__ == "__main__":
    # Replace each <placeholder> with your own value before running.
    keyfile = "<private_key_path>"
    source_project_hash = "<source_project_hash>"
    target_project_name = "<target_project_name>"
    # No trailing space inside the quotes: it would become part of the hash.
    workflow_template_hash = "<workflow_template_hash>"
    bundle_size = 50
    # None copies everything; otherwise list the feature hashes to include.
    feature_hashes_to_include = None

    main(
        keyfile=keyfile,
        source_project_hash=source_project_hash,
        target_project_name=target_project_name,
        workflow_template_hash=workflow_template_hash,
        bundle_size=bundle_size,
        feature_hashes_to_include=feature_hashes_to_include,
    )
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.57s/it]
Done!
Access project on https://app.encord.com/projects/view/<target_project_hash>/summary