Skip to content

Data Connector Validation Documentation

Documentation for the terrakit.validate modules.


pipeline_model

PipelineModel

Bases: BaseModel

A model for configuring the TerraKit Pipeline. This class defines the attributes common across all pipeline steps.

Attributes:

Name Type Description
dataset_name str

Name of the dataset. Default is "terrakit_curated_dataset".

working_dir Path

Working directory for the pipeline. Default is "./tmp". The directory is created if it does not already exist.

Source code in terrakit/validate/pipeline_model.py
class PipelineModel(BaseModel):
    """
    A model for configuring the TerraKit Pipeline. This class defines the attributes common
    across all pipeline steps.

    Attributes:
        dataset_name (str): Name of the dataset. Default is "terrakit_curated_dataset".
        working_dir (Path): Working directory for the pipeline. Default is "./tmp". The directory is created if it does not already exist.
    """

    dataset_name: str = "terrakit_curated_dataset"
    working_dir: Path = Path("./tmp")

    @field_validator("dataset_name", mode="before")
    def check_dataset_name(cls, v):
        """
        Validate that the dataset_name does not contain special characters.

        Args:
            v (str): The dataset name to validate.

        Returns:
            str: The validated dataset name.

        Raises:
            ValueError: If the dataset name contains characters other than
                letters, digits, '_', '-' or '.'.
        """
        import re  # local import so the module's import block is unchanged

        # BUG FIX: this validator previously returned v unchecked, despite its
        # docstring claiming special-character validation. Non-str values are
        # left for pydantic's own type validation to reject.
        if isinstance(v, str) and not re.fullmatch(r"[A-Za-z0-9_.-]+", v):
            raise ValueError(
                f"Dataset name '{v}' contains special characters. Please use only letters, digits, '_', '-' or '.'"
            )
        return v

    @field_validator("working_dir", mode="before")
    def check_working_dir(cls, v) -> Path:
        """
        Validate and create the working directory if it does not exist.

        Args:
            v (Path): The working directory path.

        Returns:
            Path: The validated and existing working directory path.

        Raises:
            ValueError: If the provided path is not a directory.
        """
        if v is None:
            v = "./tmp"
        pathname = Path(v)
        logging.debug(f"Working directory set to: {pathname}")
        if not pathname.exists():
            logging.info(f"Creating working directory: {pathname}")
            pathname.mkdir(parents=True, exist_ok=True)
        elif not pathname.is_dir():  # idiomatic pathlib check (was os.path.isdir(...) is False)
            raise ValueError(
                f"Working directory must be a path, not a file: 'working_dir' set to {v}"
            )
        return pathname

check_dataset_name

Validate that the dataset_name does not contain special characters.

Parameters:

Name Type Description Default
v str

The dataset name to validate.

required

Returns:

Name Type Description
str

The validated dataset name.

Source code in terrakit/validate/pipeline_model.py
@field_validator("dataset_name", mode="before")
def check_dataset_name(cls, v):
    """
    Validate that the dataset_name does not contain special characters.

    Args:
        v (str): The dataset name to validate.

    Returns:
        str: The validated dataset name.

    Raises:
        ValueError: If the dataset name contains characters other than
            letters, digits, '_', '-' or '.'.
    """
    import re  # local import so the module's import block is unchanged

    # BUG FIX: this validator previously returned v unchecked, despite its
    # docstring claiming special-character validation. Non-str values are
    # left for pydantic's own type validation to reject.
    if isinstance(v, str) and not re.fullmatch(r"[A-Za-z0-9_.-]+", v):
        raise ValueError(
            f"Dataset name '{v}' contains special characters. Please use only letters, digits, '_', '-' or '.'"
        )
    return v

check_working_dir

Validate and create the working directory if it does not exist.

Parameters:

Name Type Description Default
v Path

The working directory path.

required

Returns:

Name Type Description
Path Path

The validated and existing working directory path.

Raises:

Type Description
ValueError

If the provided path is not a directory.

Source code in terrakit/validate/pipeline_model.py
@field_validator("working_dir", mode="before")
def check_working_dir(cls, v) -> Path:
    """
    Validate and create the working directory if it does not exist.

    Args:
        v (Path): The working directory path.

    Returns:
        Path: The validated and existing working directory path.

    Raises:
        ValueError: If the provided path is not a directory.
    """
    if v is None:
        v = "./tmp"
    pathname = Path(v)
    logging.debug(f"Working directory set to: {pathname}")
    if not pathname.exists():
        logging.info(f"Creating working directory: {pathname}")
        pathname.mkdir(parents=True, exist_ok=True)
    elif not pathname.is_dir():  # idiomatic pathlib check (was os.path.isdir(...) is False)
        raise ValueError(
            f"Working directory must be a path, not a file: 'working_dir' set to {v}"
        )
    return pathname

pipeline_model_validation

Validate the TerraKit Pipeline model configuration.

Parameters:

Name Type Description Default
dataset_name str

Name of the dataset.

required
working_dir str

Working directory for the pipeline.

required

Returns:

Name Type Description
PipelineModel

The validated PipelineModel instance.

Raises:

Type Description
TerrakitValidationError

If the provided arguments are invalid.

Source code in terrakit/validate/pipeline_model.py
def pipeline_model_validation(dataset_name: str, working_dir: str):
    """
    Validate the TerraKit Pipeline model configuration.

    Args:
        dataset_name (str): Name of the dataset.
        working_dir (str): Working directory for the pipeline.

    Returns:
        PipelineModel: The validated PipelineModel instance.

    Raises:
        TerrakitValidationError: If the provided arguments are invalid.
    """
    try:
        # Constructing the model already runs every field validator; the
        # previous extra PipelineModel.model_validate(parent_params) pass
        # re-validated an already-validated instance and was redundant.
        pipeline_model = PipelineModel(
            dataset_name=dataset_name, working_dir=working_dir
        )
    except ValidationError as e:
        for error in e.errors():
            logging.error(
                f"Invalid parent arguments: {error['msg']}. \n\t'{error['loc'][0]}' currently set to '{error['input']}'. Please update to a valid entry."
            )
        raise TerrakitValidationError(
            "Invalid parent arguments", details=e.errors()
        ) from e
    logging.info(f"Processing with parent arguments: {pipeline_model}")
    return pipeline_model

labels_model

LabelsModel

Bases: BaseModel

Model for configuration of the process labels TerraKit pipeline step.

Attributes:

Name Type Description
model_config ConfigDict

Configuration dictionary for the model.

labels_folder Path

Path to the folder containing label files.

active bool

Indicates if the labels step is active. Default is True.

label_type Literal['vector', 'raster']

Type of labels, either 'vector' or 'raster'. Default is 'vector'.

datetime_info Literal['filename', 'csv']

Specifies how datetime information is stored, either by 'filename' or 'csv'. Default is 'filename'.

Source code in terrakit/validate/labels_model.py
class LabelsModel(BaseModel):
    """
    Model for configuration of the process labels TerraKit pipeline step.

    Attributes:
        model_config (ConfigDict): Configuration dictionary for the model.
        labels_folder (Path): Path to the folder containing label files.
        active (bool): Indicates if the labels step is active. Default is True.
        label_type (Literal["vector", "raster"]): Type of labels, either 'vector' or 'raster'. Default is 'vector'.
        datetime_info (Literal["filename", "csv"]): Specifies how datetime information is stored, either by 'filename' or 'csv'. Default is 'filename'.
    """

    model_config = ConfigDict(from_attributes=True)

    labels_folder: Path
    active: bool = True
    label_type: Literal["vector", "raster"] = "vector"
    datetime_info: Literal["filename", "csv"] = "filename"

    @field_validator("labels_folder", mode="after")
    def check_labels_folder(cls, v):
        """
        Validates that the labels_folder exists and contains at least one file.

        Raises:
            ValueError: If the labels_folder does not exist or is empty.
        """
        if not os.path.exists(v):
            raise ValueError(
                f"Labels folder '{v}' does not exist. Please provide a valid labels folder"
            )

        # BUG FIX: the original compared the listing itself to an int
        # (`os.listdir(v) == 0`), which is always False, so empty folders
        # were never rejected. Compare the number of entries instead.
        if len(os.listdir(v)) == 0:
            raise ValueError(
                f"Labels folder '{v}' does not contain any files. Please provide a valid labels folder with at least one labels file"
            )

        return v

    @field_validator("label_type", mode="after")
    def check_labels_type(cls, v, info: ValidationInfo):
        """
        Validates that labels_folder holds at least one file matching the
        chosen label_type ('vector' -> .json, 'raster' -> .tif).

        Raises:
            ValueError: If no file with the expected extension is found.
        """
        labels_folder = info.data.get("labels_folder")

        # v is Literal["vector", "raster"] and this runs in mode="after",
        # so it is guaranteed to be one of the two supported values.
        valid_file_type = "json" if v == "vector" else "tif"
        check_for_valid_type = any(
            filename.endswith(valid_file_type)
            for filename in os.listdir(labels_folder)
        )

        if not check_for_valid_type:
            raise ValueError(
                f"Labels folder '{labels_folder}' does not contain any supported files. Please provide a valid labels folder with at least one valid .{valid_file_type} file."
            )
        return v

    @field_validator("datetime_info", mode="before")
    def check_datetime_info(cls, v):
        """
        Placeholder for future validation of datetime_info.

        Currently, no specific checks are implemented for datetime_info.
        """

        return v

check_labels_folder

Validates that the labels_folder exists and contains at least one file.

Raises:

Type Description
ValueError

If the labels_folder does not exist or is empty.

Source code in terrakit/validate/labels_model.py
@field_validator("labels_folder", mode="after")
def check_labels_folder(cls, v):
    """
    Validates that the labels_folder exists and contains at least one file.

    Raises:
        ValueError: If the labels_folder does not exist or is empty.
    """
    if not os.path.exists(v):
        raise ValueError(
            f"Labels folder '{v}' does not exist. Please provide a valid labels folder"
        )

    # BUG FIX: the original compared the listing itself to an int
    # (`os.listdir(v) == 0`), which is always False, so empty folders
    # were never rejected. Compare the number of entries instead.
    if len(os.listdir(v)) == 0:
        raise ValueError(
            f"Labels folder '{v}' does not contain any files. Please provide a valid labels folder with at least one labels file"
        )

    return v

check_datetime_info

Placeholder for future validation of datetime_info.

Currently, no specific checks are implemented for datetime_info.

Source code in terrakit/validate/labels_model.py
@field_validator("datetime_info", mode="before")
def check_datetime_info(cls, v):
    """
    No-op validator reserved for future datetime_info checks.

    The value is currently passed through unchanged; no specific
    validation is performed yet.
    """
    return v

terrakit.validate.download_model

DateAllowance

Bases: BaseModel

Model for specifying date allowance around the target date.

Attributes:

Name Type Description
pre_days int

Number of days before the target date to include. Default is 0.

post_days int

Number of days after the target date to include. Default is 7.

Example
from terrakit.validate.download_model import DateAllowance

date_allowance = DateAllowance(
    pre_days = 0, post_days = 21
)
Source code in terrakit/validate/download_model.py
class DateAllowance(BaseModel):
    """
    Window of days to search on either side of the target date.

    Attributes:
        pre_days (int): Number of days before the target date to include. Default is 0.
        post_days (int): Number of days after the target date to include. Default is 7.

    Example:
        ```python
        from terrakit.validate.download_model import DateAllowance

        date_allowance = DateAllowance(
            pre_days = 0, post_days = 21
        )
        ```
    """

    pre_days: int = 0
    post_days: int = 7

Transform

Bases: BaseModel

Model for specifying data transformation options.

Attributes:

Name Type Description
scale_data_xarray bool

Whether to scale the data using xarray. Default is True.

impute_nans bool

Whether to impute NaN values. Default is True.

reproject bool

Whether to reproject the data. Default is True.

Example
from terrakit.validate.download_model import Transform

transform = Transform(
    scale_data_xarray=True,
    impute_nans=True,
    reproject=True,
)
Source code in terrakit/validate/download_model.py
class Transform(BaseModel):
    """
    Model for specifying data transformation options.

    Attributes:
        scale_data_xarray (bool): Whether to scale the data using xarray. Default is True.
        impute_nans (bool): Whether to impute NaN values. Default is True.
        reproject (bool): Whether to reproject the data. Default is True.

    Example:
        ```python
        from terrakit.validate.download_model import Transform

        transform = Transform(
            scale_data_xarray=True,
            impute_nans=True,
            reproject=True,
        )
        ```
    """

    scale_data_xarray: bool = True
    impute_nans: bool = True
    reproject: bool = True
    # >>> INCLUDE NEW TRANSFORMATIONS HERE <<<
    # <new_transformation_option>: bool = False
    # NOTE: the placeholder above was previously a bare string literal, which
    # mkdocstrings rendered as a bogus attribute docstring for `reproject`.

reproject: bool = True class-attribute instance-attribute

New transformation options can be added to this model in the form `<new_transformation_option>: bool = False`.

DataSource

Bases: BaseModel

Model for specifying data source configuration.

Attributes:

Name Type Description
data_connector str

The data connector to use. Default is "sentinel_aws".

collection_name str

The collection name to download. Default is "sentinel-2-l2a".

bands list[str]

The bands to download. Default is ["blue", "green", "red"].

save_file str | None

The file path to save the downloaded data. Default is None.

Example
from terrakit.validate.download_model import DataSource

data_source = DataSource(
    data_connector = "sentinel_aws",
    collection_name = "sentinel-2-l2a",
    bands = ["blue", "green", "red"],
    save_file = "",
)
Source code in terrakit/validate/download_model.py
class DataSource(BaseModel):
    """
    Model for specifying data source configuration.

    Attributes:
        data_connector (str): The data connector to use. Default is "sentinel_aws".
        collection_name (str): The collection name to download. Default is "sentinel-2-l2a".
        bands (list[str]): The bands to download. Default is ["blue", "green", "red"].
        save_file (str | None): The file path to save the downloaded data. Default is None.

    Example:
        ```python
        from terrakit.validate.download_model import DataSource

        data_source = DataSource(
            data_connector = "sentinel_aws",
            collection_name = "sentinel-2-l2a",
            bands = ["blue", "green", "red"],
            save_file = "",
        )
        ```
    """

    data_connector: str = "sentinel_aws"
    collection_name: str = "sentinel-2-l2a"
    # Use a default_factory rather than a shared mutable list literal,
    # consistent with DownloadModel.data_sources in this module.
    bands: list[str] = Field(default_factory=lambda: ["blue", "green", "red"])
    save_file: str | None = None

DownloadModel

Bases: BaseModel

Model for configuring the download process.

Attributes:

Name Type Description
model_config ConfigDict

Configuration dictionary.

transform Transform

Transformation options.

date_allowance DateAllowance

Date allowance around the target date.

active bool

Whether the download step is active. Default is True.

max_cloud_cover int

Maximum cloud cover allowed. Default is 80.

keep_files bool

Whether to keep redundant shapefiles. Default is False.

datetime_bbox_shp_file str

File path for datetime bounding box shapefile. Default is "./tmp/terrakit_curated_dataset_all_bboxes.shp".

labels_shp_file str

File path for labels shapefile. Default is "./tmp/terrakit_curated_dataset_labels.shp".

data_sources list[DataSource]

List of data sources to download. Default is an empty list.

Source code in terrakit/validate/download_model.py
class DownloadModel(BaseModel):
    """
    Model for configuring the download process.

    Attributes:
        model_config (ConfigDict): Configuration dictionary.
        transform (Transform): Transformation options.
        date_allowance (DateAllowance): Date allowance around the target date.
        active (bool): Whether the download step is active. Default is True.
        max_cloud_cover (int): Maximum cloud cover allowed. Default is 80.
        keep_files (bool): Whether to keep redundant shapefiles. Default is False.
        datetime_bbox_shp_file (str): File path for datetime bounding box shapefile. Default is "./tmp/terrakit_curated_dataset_all_bboxes.shp".
        labels_shp_file (str): File path for labels shapefile. Default is "./tmp/terrakit_curated_dataset_labels.shp".
        data_sources (list[DataSource]): List of data sources to download. Default is an empty list.
    """

    model_config = ConfigDict(from_attributes=True)

    transform: Transform
    date_allowance: DateAllowance
    active: bool = True
    max_cloud_cover: int = 80
    keep_files: bool = False
    datetime_bbox_shp_file: str = "./tmp/terrakit_curated_dataset_all_bboxes.shp"
    labels_shp_file: str = "./tmp/terrakit_curated_dataset_labels.shp"
    data_sources: list[DataSource] = Field(default_factory=list)

tiling_model


terrakit.validate.data_connector

ConnectorType

Bases: BaseModel

Attributes:

Name Type Description
connector_type Literal

The type of connector to be used to download data.

Example

terrakit.DataConnector(connector_type="nasa_earthdata")
or
terrakit.DataConnector({"connector_type": "nasa_earthdata"})

Source code in terrakit/validate/data_connector.py
class ConnectorType(BaseModel):
    """
    Validated choice of data connector for downloading data.

    Attributes:
        connector_type (Literal): The type of connector to be used to download data.

    Example:
        ```
        terrakit.DataConnector(connector_type="nasa_earthdata")
        ```
        or
        ```
        terrakit.DataConnector({"connector_type": "nasa_earthdata"})
        ```
    """

    connector_type: Literal[
        "nasa_earthdata",
        "sentinelhub",
        "sentinel_aws",
        "IBMResearchSTAC",
        "TheWeatherCompany",
    ]
    """The type of connector to be used to download data: nasa_earthdata, sentinelhub, sentinel_aws, IBMResearchSTAC or TheWeatherCompany."""

connector_type: Literal['nasa_earthdata', 'sentinelhub', 'sentinel_aws', 'IBMResearchSTAC', 'TheWeatherCompany'] instance-attribute

The type of connector to be used to download data: nasa_earthdata, sentinelhub, sentinel_aws, IBMResearchSTAC or TheWeatherCompany.


helpers

check_collection_exists

Check if the provided data_collection_name exists in the collections list.

Parameters:

Name Type Description Default
data_collection_name str

The name of the collection to check.

required
collections list

A list of available collections.

required

Raises:

Type Description
TerrakitValueError

If the collection does not exist.

Source code in terrakit/validate/helpers.py
def check_collection_exists(data_collection_name: str, collections: list):
    """
    Verify that data_collection_name is one of the available collections.

    Parameters:
        data_collection_name (str): The name of the collection to check.
        collections (list): A list of available collections.

    Raises:
        TerrakitValueError: If the collection does not exist.
    """
    # Guard clause: a known collection needs no further work.
    if data_collection_name in collections:
        return
    error_msg = f"Invalid collection '{data_collection_name}'. Please choose from one of the following collection {collections}"
    logger.error(error_msg)
    raise TerrakitValueError(error_msg)

check_start_end_date

Validate the start and end dates ensuring the end date is after the start date.

Parameters:

Name Type Description Default
date_start str

The start date in ISO format (YYYY-MM-DD).

required
date_end str

The end date in ISO format (YYYY-MM-DD).

required

Raises:

Type Description
TerrakitValueError

If the date range is invalid.

Source code in terrakit/validate/helpers.py
def check_start_end_date(date_start: str, date_end: str) -> None:
    """
    Validate the start and end dates ensuring the end date is after the start date.

    Parameters:
        date_start (str): The start date in ISO format (YYYY-MM-DD).
        date_end (str): The end date in ISO format (YYYY-MM-DD).

    Raises:
        TerrakitValueError: If the date range is invalid.
    """
    # Each date is first validated individually (format, bounds).
    check_datetime(start=True, date_str=date_start)
    check_datetime(start=False, date_str=date_end)

    span = date.fromisoformat(date_end) - date.fromisoformat(date_start)
    if span.days < 0:
        err_msg = f"Invalid date range: {date_start} to {date_end}. End date must be greater than start date."
        logger.error(err_msg)
        raise TerrakitValueError(err_msg)

check_datetime

Validate a date string ensuring it's in ISO format and not in the future.

Parameters:

Name Type Description Default
start bool

True if validating the start date, False for end date.

required
date_str str

The date string to validate.

required

Raises:

Type Description
TerrakitValueError

If the date format is incorrect or the date is in the future.

Source code in terrakit/validate/helpers.py
def check_datetime(start: bool, date_str: str) -> None:
    """
    Validate a date string ensuring it's in ISO format and not in the future.

    Parameters:
        start (bool): True if validating the start date, False for end date.
        date_str (str): The date string to validate.

    Raises:
        TerrakitValueError: If the date format is incorrect, the date is in
            the future, or the date is before 01/01/1950.
    """
    start_or_end = "start" if start else "end"
    try:
        query_date = date.fromisoformat(date_str)
    except ValueError as e:
        err_msg = f"Invalid {start_or_end} date format: {date_str}. Please use ISO format (YYYY-MM-DD)."
        logger.error(err_msg)
        # FIX: chain the parse error with `from e` instead of smuggling it in
        # as a positional argument (the old form required a type: ignore).
        raise TerrakitValueError(err_msg) from e

    # date.today() replaces the roundabout datetime.date(datetime.now()).
    if query_date > date.today():
        err_msg = f"Invalid {start_or_end} date: {date_str}. Date must be in the past."
        logger.error(err_msg)
        raise TerrakitValueError(err_msg)
    # date(1950, 1, 1) replaces re-parsing the constant via strptime on
    # every call.
    if query_date < date(1950, 1, 1):
        err_msg = (
            f"Invalid {start_or_end} date: {date_str}. Date must be after 01/01/1950."
        )
        logger.error(err_msg)
        raise TerrakitValueError(err_msg)

check_area_polygon

For connector_types that do not yet support 'area_polygon', this function provides a check to use 'bbox' instead.

Parameters:

Name Type Description Default
area_polygon

The area polygon to check.

required
connector_type str

The type of connector.

required

Raises:

Type Description
TerrakitValueError

If 'area_polygon' is provided instead of 'bbox'.

Source code in terrakit/validate/helpers.py
def check_area_polygon(area_polygon, connector_type: str) -> None:
    """
    For connector_types that do not yet support 'area_polygon', this function provides a check to use 'bbox' instead.

    Parameters:
        area_polygon: The area polygon to check.
        connector_type (str): The type of connector.

    Raises:
        TerrakitValueError: If 'area_polygon' is provided instead of 'bbox'.
    """
    # Guard clause: nothing to reject when no polygon was supplied.
    if area_polygon is None:
        return
    err_msg = f"Error: Issue finding data from {connector_type}. Please use 'bbox' instead of 'area_polygon'"
    logger.error(err_msg)
    raise TerrakitValueError(err_msg)

check_bbox

Validate the bounding box ensuring it's a list of four floats and not a degenerate rectangle.

Parameters:

Name Type Description Default
bbox list

The bounding box to check.

required
connector_type str

The type of connector.

required

Raises:

Type Description
TerrakitValueError

If the bounding box is invalid.

Source code in terrakit/validate/helpers.py
def check_bbox(bbox: list, connector_type: str) -> None:
    """
    Validate the bounding box ensuring it's a list of four floats and not a degenerate rectangle.

    Parameters:
        bbox (list): The bounding box as [west, south, east, north] in EPSG:4326.
        connector_type (str): The type of connector (used in error messages).

    Raises:
        TerrakitValueError: If the bounding box is invalid.
    """
    if bbox is None:
        error_msg = f"Error: Issue finding data from {connector_type}. Please specify at least one of 'bbox' and 'area_polygon'"
        logger.error(error_msg)
        raise TerrakitValueError(error_msg)
    if not isinstance(bbox, list):
        err_msg = f"Error: Issue finding data from {connector_type} with bbox '{bbox}'. Please specify 'bbox' as a list of floats."
        logger.error(err_msg)
        raise TerrakitValueError(err_msg)
    if len(bbox) != 4:
        err_msg = f"Error: Issue finding data from {connector_type} with bbox '{bbox}'. Please specify 'bbox' as a list of length 4."
        logger.error(err_msg)
        raise TerrakitValueError(err_msg)

    # BUG FIX: the original only *tested* float(item) and then compared the
    # raw entries, so a bbox of numeric strings passed the float check but
    # crashed with TypeError on the range comparison below. Convert once and
    # compare the converted values. TypeError (e.g. from a None entry) is
    # also mapped to the same TerrakitValueError instead of escaping.
    coords = []
    for item in bbox:
        try:
            coords.append(float(item))
        except (TypeError, ValueError) as e:
            err_msg = f"Error: Issue finding data from {connector_type} with bbox '{bbox}'. Please specify 'bbox' as a list of floats. The entry '{item}' is not a float."
            logger.error(err_msg)
            raise TerrakitValueError(err_msg) from e

    # All four corners identical -> zero-area rectangle.
    if len(set(coords)) == 1:
        err_msg = f"Error: Issue finding data from {connector_type} with bbox '{bbox}'. Cannot determine area from 'bbox'. Please specify a valid area."
        logger.error(err_msg)
        raise TerrakitValueError(err_msg)
    west, south, east, north = coords
    if not (-180 <= west < east <= 180 and -90 <= south < north <= 90):
        raise TerrakitValueError(
            f"Error: Issue finding data from {connector_type} with bbox '{bbox}'. Bbox is expected as 'west, south, east, north' or 'minx, miny, maxx, maxy' using EPSG: 4326 coordinate system."
        )