Source code for cloudos_cli.utils.array_job

import re
import sys
from cloudos_cli.utils.errors import BadRequestException



[docs]
def is_valid_regex(s):
    """
    Validates whether the given string is a valid regular expression.

    Parameters
    ----------
    s : str
        The string to be checked as a regular expression.

    Returns
    -------
    bool
        True if the string compiles as a regular expression, False otherwise.

    Notes
    -----
    Besides ``re.error``, ``re.compile`` raises ``OverflowError`` when a
    repetition count is too large (e.g. ``a{99999999999999999999}``). Such
    patterns are reported as invalid instead of letting the exception
    propagate to the caller.
    """
    try:
        re.compile(s)
    except (re.error, OverflowError):
        # Either a syntax error or a repetition count beyond the engine limit.
        return False
    return True



[docs]
def is_glob_pattern(s):
    """
    Tell whether a string contains any glob wildcard character.

    The characters treated as glob syntax are '*', '?' and '['; finding
    any one of them anywhere in the string is enough.

    Parameters
    ----------
    s : str
        The string to inspect.

    Returns
    -------
    bool
        True if at least one of '*', '?' or '[' occurs in `s`, otherwise False.
    """
    for wildcard in "*?[":
        if wildcard in s:
            return True
    return False



[docs]
def is_probably_regex(s):
    """
    Heuristically decide whether a string is meant as a regular expression.

    The string must first compile as a regex (checked through
    `is_valid_regex`). It is then scanned for constructs that usually
    signal deliberate regex use rather than a plain file name: ``.*``,
    ``.+``, the escapes ``\\d``/``\\w``/``\\s``, a bracketed character
    class, a parenthesised group, a ``{n}``/``{n,m}`` quantifier, the
    anchors ``^`` and ``$``, and the alternation bar ``|``.

    Parameters
    ----------
    s : str
        The string to evaluate.

    Returns
    -------
    bool
        True if `s` is a valid regex and contains at least one of the
        indicator constructs, False otherwise.
    """
    if is_valid_regex(s):
        # Constructs that usually indicate actual regex use (not just file names)
        indicator_patterns = (
            r"\.\*",
            r"\.\+",
            r"\\[dws]",
            r"\[[^\]]+\]",
            r"\([^\)]+\)",
            r"\{\d+(,\d*)?\}",
            r"\^",
            r"\$",
            r"\|",
        )
        for indicator in indicator_patterns:
            if re.search(indicator, s) is not None:
                return True
    return False



[docs]
def classify_pattern(s):
    """
    Label a pattern string as "regex", "glob" or "exact".

    The regex heuristic is tried first, so a string that satisfies both
    `is_probably_regex` and `is_glob_pattern` is reported as "regex".

    Parameters
    ----------
    s : str
        The string pattern to classify.

    Returns
    -------
    str
        - "regex" if `is_probably_regex(s)` is true.
        - "glob" if it is not a probable regex but `is_glob_pattern(s)` is true.
        - "exact" in every other case.
    """
    if is_probably_regex(s):
        return "regex"
    return "glob" if is_glob_pattern(s) else "exact"



[docs]
def generate_datasets_for_project(cloudos_url, apikey, workspace_id, project_name, verify_ssl):
    """
    Build a `Datasets` object for a project in a CloudOS workspace.

    The `Datasets` constructor is called with the given connection details
    and `cromwell_token=None`. Two failure modes are turned into a printed
    message followed by process exit with status 1; any other
    `BadRequestException` is re-raised.

    Parameters
    ----------
    cloudos_url : str
        The URL of the CloudOS instance.
    apikey : str
        The API key for authentication.
    workspace_id : str
        The ID of the workspace where the project resides.
    project_name : str
        The name of the project for which datasets are generated.
    verify_ssl : bool
        Forwarded to `Datasets` as its `verify` argument.

    Returns
    -------
    Datasets
        The `Datasets` instance created for the specified project.

    Raises
    ------
    SystemExit
        With status 1 when `Datasets` raises `ValueError` (reported as the
        project not being found) or a `BadRequestException` whose text
        contains 'Forbidden'.
    BadRequestException
        Any `BadRequestException` whose text does not contain 'Forbidden'.
    """
    # imported here rather than at module level to avoid a circular import
    from cloudos_cli.datasets import Datasets

    try:
        return Datasets(
            cloudos_url=cloudos_url,
            apikey=apikey,
            workspace_id=workspace_id,
            project_name=project_name,
            verify=verify_ssl,
            cromwell_token=None,
        )
    except ValueError:
        print(f"No {project_name} element in projects was found")
        sys.exit(1)
    except BadRequestException as err:
        # Only the 'Forbidden' case gets the friendly message; the rest propagate.
        if 'Forbidden' not in str(err):
            raise err
        print('It seems your call is not authorised. Please check if ' +
              'your workspace is restricted by Airlock and if your API key is valid.')
        sys.exit(1)



[docs]
def get_file_or_folder_id(cloudos_url, apikey, workspace_id, project_name, verify_ssl, command_dir, command_name, is_file=True):
    """Retrieve the ID of a specific file or folder within a CloudOS workspace.

    Parameters
    ----------
    cloudos_url : str
        The base URL of the CloudOS API.
    apikey : str
        The API key for authenticating requests to the CloudOS API.
    workspace_id : str
        The ID of the workspace containing the project.
    project_name : str
        The name of the project within the workspace.
    verify_ssl : bool
        Whether to verify SSL certificates for the API requests.
    command_dir : str
        Slash-separated folder path inside the project. When `is_file` is
        True it is the folder whose files are listed; when False it is the
        folder whose ID is returned.
    command_name : str
        The name of the file to look up. Only used when `is_file` is True.
    is_file : bool, optional
        Whether to retrieve a file ID (True) or folder ID (False). Default is True.

    Returns
    -------
    str: The value of the "_id" field of the matching file or folder
        (an empty string if the entry has no "_id").

    Raises
    ------
    ValueError
        If the specified file or folder is not found.

    Notes
    -----
    - This function uses the `generate_datasets_for_project` function to create a Datasets object for the specified project.
    - Files are looked up in the listing of `command_dir`.
    - A top-level folder is looked up in the project listing; a nested folder
      is looked up in the listing of its immediate parent folder, so paths of
      any depth (e.g. ``a/b/c``) resolve to the right parent (``a/b``).
    - Empty path segments (leading, trailing or doubled slashes) are ignored
      when resolving a folder.
    """
    # create a Datasets() class
    ds = generate_datasets_for_project(cloudos_url, apikey, workspace_id, project_name, verify_ssl)

    if is_file:
        # get all files from a folder
        content = ds.list_folder_content(command_dir)
        for file in content['files']:
            if file.get("name") == command_name:
                return file.get("_id", '')
        raise ValueError(f"File '{command_name}' not found in directory '{command_dir}'.")

    # Drop empty segments so "a/b/" or "/a/b" do not yield an empty folder name.
    parts = [part for part in command_dir.split("/") if part]
    if len(parts) > 1:
        # The target folder is listed in its immediate parent, not in the
        # first path segment.
        # NOTE(review): assumes list_folder_content accepts a multi-segment
        # path such as "a/b" - confirm against Datasets.
        folders = ds.list_folder_content("/".join(parts[:-1]))
        folder_to_search = parts[-1]
    else:
        # Top-level folder: it is listed directly under the project.
        folders = ds.list_project_content()
        folder_to_search = parts[0] if parts else command_dir

    for folder in folders['folders']:
        if folder.get("name") == folder_to_search:
            return folder.get("_id", '')
    raise ValueError(f"Folder '{folder_to_search}' not found in project.")



[docs]
def extract_project(path):
    """
    Split a path into a leading project name and the rest of the path.

    Leading and trailing slashes are stripped before the path is split on
    "/". The first segment is treated as a project name only when the path
    is long enough: at least three segments for a plain path, or at least
    four when the path contains glob characters
    (e.g. ``PROJECT/Data/Downloads/*.csv``). Shorter paths are returned
    whole with an empty project name.

    Parameters
    ----------
    path : str
        The file path to process.

    Returns
    -------
    tuple: A tuple containing:
        - str: The project name (empty string if no project was detected).
        - str: The remaining path, without surrounding slashes.
    """
    segments = path.strip("/").split("/")
    # Glob paths need one extra segment before the first one counts as a project.
    minimum_segments = 4 if is_glob_pattern(path) else 3

    if len(segments) < minimum_segments:
        # No project in the path; the caller falls back to its own project name.
        return "", "/".join(segments)

    project, *remainder = segments
    return project, "/".join(remainder)