Module library.config

Expand source code
import importlib
import json
from datetime import datetime

import requests
import yaml
from jinja2 import Template

from .utils import format_url


# Custom dumper created for list indentation
class Dumper(yaml.Dumper):
    def increase_indent(self, flow=False, indentless=False):
        return super(Dumper, self).increase_indent(flow, False)


class Config:
    """
    Config will take a configuration file from the templates directly
    or any given configuration file and compute/output a configuration
    file to pass into the Ingestor
    """

    def __init__(self, path: str, version: str = None):
        self.path = path
        self.version = version

    @property
    def unparsed_unrendered_template(self) -> str:
        """importing the yml file into a string"""
        with open(self.path, "r") as f:
            return f.read()

    @property
    def parsed_unrendered_template(self) -> dict:
        """parsing unrendered template into a dictionary"""
        return yaml.safe_load(self.unparsed_unrendered_template)

    def parsed_rendered_template(self, **kwargs) -> dict:
        """render template, then parse into a dictionary"""
        template = Template(self.unparsed_unrendered_template)
        return yaml.safe_load(template.render(**kwargs))

    @property
    def source_type(self) -> str:
        """determine the type of the source, either url, socrata or script"""
        template = self.parsed_unrendered_template
        source = template["dataset"]["source"]
        return list(source.keys())[0]

    def version_socrata(self, uid: str) -> str:
        """using the socrata API, collect the 'data last update' date"""
        metadata = requests.get(
            f"https://data.cityofnewyork.us/api/views/{uid}.json"
        ).json()
        version = datetime.fromtimestamp(metadata["rowsUpdatedAt"]).strftime("%Y%m%d")
        return version

    # @property
    # def version_bytes(self) -> str:
    #     """parsing bytes of the big apple to get the latest bytes version"""
    #     # scrape from bytes to get a version
    #     return None

    def valid_version(self, version: str) -> bool:
        """check that a version name is valid"""
        return "{" not in version and "}" not in version

    @property
    def version_today(self) -> str:
        """
        set today as the version name - for use with unspecified
        or invalid versions
        """
        return datetime.today().strftime("%Y%m%d")

    @property
    def compute(self) -> dict:
        """based on given yml file, compute the configuration"""
        if self.source_type == "script":
            if self.version:
                version = self.version
            else:
                version = self.version_today
            config = self.parsed_rendered_template(version=version)
            _config = self.parsed_unrendered_template

            script_name = _config["dataset"]["source"]["script"]
            module = importlib.import_module(f"library.script.{script_name}")
            scriptor = module.Scriptor()
            url = scriptor.runner()

            options = config["dataset"]["source"]["options"]
            geometry = config["dataset"]["source"]["geometry"]
            config["dataset"]["source"] = {
                "url": {"path": url, "subpath": ""},
                "options": options,
                "geometry": geometry,
            }

        if self.source_type == "url":
            # Load unrendered template to check for yml-specified
            # version (_version)
            _config = self.parsed_unrendered_template
            _version = _config["dataset"]["version"]

            # If a custom version specified from CLI, take custom version
            if self.version:
                version = self.version

            # If no custom version specified and version in config
            # is valid, take config version (_version)
            if not self.version and self.valid_version(_version):
                version = _version

            # If no custom version and no config version,
            # assign today as version
            if not self.version and not self.valid_version(_version):
                version = self.version_today

            # Render template
            config = self.parsed_rendered_template(version=version)

            # Force overwrite of yml version with appropriate version
            config["dataset"]["version"] = version

        if self.source_type == "socrata":
            # For socrata we are computing the url and add the url object to the config file
            _uid = self.parsed_unrendered_template["dataset"]["source"]["socrata"][
                "uid"
            ]
            _format = self.parsed_unrendered_template["dataset"]["source"]["socrata"][
                "format"
            ]
            config = self.parsed_rendered_template(version=self.version_socrata(_uid))

            if _format == "csv":
                url = f"https://data.cityofnewyork.us/api/views/{_uid}/rows.csv"
            if _format == "geojson":
                url = f"https://nycopendata.socrata.com/api/geospatial/{_uid}?method=export&format=GeoJSON"

            options = config["dataset"]["source"]["options"]
            geometry = config["dataset"]["source"]["geometry"]
            config["dataset"]["source"] = {
                "url": {"path": url, "subpath": ""},
                "options": options,
                "geometry": geometry,
            }

        path = config["dataset"]["source"]["url"]["path"]
        subpath = config["dataset"]["source"]["url"]["subpath"]
        config["dataset"]["source"]["url"]["gdalpath"] = format_url(path, subpath)
        return config

    @property
    def compute_json(self) -> str:
        return json.dumps(self.compute, indent=4)

    @property
    def compute_yml(self) -> str:
        return yaml.dump(
            self.compute,
            Dumper=Dumper,
            default_flow_style=False,
            sort_keys=False,
            indent=2,
        )

    @property
    def compute_parsed(self) -> (dict, dict, dict, dict):
        config = self.compute
        dataset = config["dataset"]
        source = dataset["source"]
        destination = dataset["destination"]
        info = dataset["info"]
        return dataset, source, destination, info

Classes

class Config (path: str, version: str = None)

Config will take a configuration file from the templates directly or any given configuration file and compute/output a configuration file to pass into the Ingestor

Expand source code
class Config:
    """
    Config will take a configuration file from the templates directly
    or any given configuration file and compute/output a configuration
    file to pass into the Ingestor
    """

    def __init__(self, path: str, version: str = None):
        self.path = path
        self.version = version

    @property
    def unparsed_unrendered_template(self) -> str:
        """importing the yml file into a string"""
        with open(self.path, "r") as f:
            return f.read()

    @property
    def parsed_unrendered_template(self) -> dict:
        """parsing unrendered template into a dictionary"""
        return yaml.safe_load(self.unparsed_unrendered_template)

    def parsed_rendered_template(self, **kwargs) -> dict:
        """render template, then parse into a dictionary"""
        template = Template(self.unparsed_unrendered_template)
        return yaml.safe_load(template.render(**kwargs))

    @property
    def source_type(self) -> str:
        """determine the type of the source, either url, socrata or script"""
        template = self.parsed_unrendered_template
        source = template["dataset"]["source"]
        return list(source.keys())[0]

    def version_socrata(self, uid: str) -> str:
        """using the socrata API, collect the 'data last update' date"""
        metadata = requests.get(
            f"https://data.cityofnewyork.us/api/views/{uid}.json"
        ).json()
        version = datetime.fromtimestamp(metadata["rowsUpdatedAt"]).strftime("%Y%m%d")
        return version

    # @property
    # def version_bytes(self) -> str:
    #     """parsing bytes of the big apple to get the latest bytes version"""
    #     # scrape from bytes to get a version
    #     return None

    def valid_version(self, version: str) -> bool:
        """check that a version name is valid"""
        return "{" not in version and "}" not in version

    @property
    def version_today(self) -> str:
        """
        set today as the version name - for use with unspecified
        or invalid versions
        """
        return datetime.today().strftime("%Y%m%d")

    @property
    def compute(self) -> dict:
        """based on given yml file, compute the configuration"""
        if self.source_type == "script":
            if self.version:
                version = self.version
            else:
                version = self.version_today
            config = self.parsed_rendered_template(version=version)
            _config = self.parsed_unrendered_template

            script_name = _config["dataset"]["source"]["script"]
            module = importlib.import_module(f"library.script.{script_name}")
            scriptor = module.Scriptor()
            url = scriptor.runner()

            options = config["dataset"]["source"]["options"]
            geometry = config["dataset"]["source"]["geometry"]
            config["dataset"]["source"] = {
                "url": {"path": url, "subpath": ""},
                "options": options,
                "geometry": geometry,
            }

        if self.source_type == "url":
            # Load unrendered template to check for yml-specified
            # version (_version)
            _config = self.parsed_unrendered_template
            _version = _config["dataset"]["version"]

            # If a custom version specified from CLI, take custom version
            if self.version:
                version = self.version

            # If no custom version specified and version in config
            # is valid, take config version (_version)
            if not self.version and self.valid_version(_version):
                version = _version

            # If no custom version and no config version,
            # assign today as version
            if not self.version and not self.valid_version(_version):
                version = self.version_today

            # Render template
            config = self.parsed_rendered_template(version=version)

            # Force overwrite of yml version with appropriate version
            config["dataset"]["version"] = version

        if self.source_type == "socrata":
            # For socrata we are computing the url and add the url object to the config file
            _uid = self.parsed_unrendered_template["dataset"]["source"]["socrata"][
                "uid"
            ]
            _format = self.parsed_unrendered_template["dataset"]["source"]["socrata"][
                "format"
            ]
            config = self.parsed_rendered_template(version=self.version_socrata(_uid))

            if _format == "csv":
                url = f"https://data.cityofnewyork.us/api/views/{_uid}/rows.csv"
            if _format == "geojson":
                url = f"https://nycopendata.socrata.com/api/geospatial/{_uid}?method=export&format=GeoJSON"

            options = config["dataset"]["source"]["options"]
            geometry = config["dataset"]["source"]["geometry"]
            config["dataset"]["source"] = {
                "url": {"path": url, "subpath": ""},
                "options": options,
                "geometry": geometry,
            }

        path = config["dataset"]["source"]["url"]["path"]
        subpath = config["dataset"]["source"]["url"]["subpath"]
        config["dataset"]["source"]["url"]["gdalpath"] = format_url(path, subpath)
        return config

    @property
    def compute_json(self) -> str:
        return json.dumps(self.compute, indent=4)

    @property
    def compute_yml(self) -> str:
        return yaml.dump(
            self.compute,
            Dumper=Dumper,
            default_flow_style=False,
            sort_keys=False,
            indent=2,
        )

    @property
    def compute_parsed(self) -> (dict, dict, dict, dict):
        config = self.compute
        dataset = config["dataset"]
        source = dataset["source"]
        destination = dataset["destination"]
        info = dataset["info"]
        return dataset, source, destination, info

Instance variables

var compute : dict

based on given yml file, compute the configuration

Expand source code
@property
def compute(self) -> dict:
    """based on given yml file, compute the configuration"""
    if self.source_type == "script":
        if self.version:
            version = self.version
        else:
            version = self.version_today
        config = self.parsed_rendered_template(version=version)
        _config = self.parsed_unrendered_template

        script_name = _config["dataset"]["source"]["script"]
        module = importlib.import_module(f"library.script.{script_name}")
        scriptor = module.Scriptor()
        url = scriptor.runner()

        options = config["dataset"]["source"]["options"]
        geometry = config["dataset"]["source"]["geometry"]
        config["dataset"]["source"] = {
            "url": {"path": url, "subpath": ""},
            "options": options,
            "geometry": geometry,
        }

    if self.source_type == "url":
        # Load unrendered template to check for yml-specified
        # version (_version)
        _config = self.parsed_unrendered_template
        _version = _config["dataset"]["version"]

        # If a custom version specified from CLI, take custom version
        if self.version:
            version = self.version

        # If no custom version specified and version in config
        # is valid, take config version (_version)
        if not self.version and self.valid_version(_version):
            version = _version

        # If no custom version and no config version,
        # assign today as version
        if not self.version and not self.valid_version(_version):
            version = self.version_today

        # Render template
        config = self.parsed_rendered_template(version=version)

        # Force overwrite of yml version with appropriate version
        config["dataset"]["version"] = version

    if self.source_type == "socrata":
        # For socrata we are computing the url and add the url object to the config file
        _uid = self.parsed_unrendered_template["dataset"]["source"]["socrata"][
            "uid"
        ]
        _format = self.parsed_unrendered_template["dataset"]["source"]["socrata"][
            "format"
        ]
        config = self.parsed_rendered_template(version=self.version_socrata(_uid))

        if _format == "csv":
            url = f"https://data.cityofnewyork.us/api/views/{_uid}/rows.csv"
        if _format == "geojson":
            url = f"https://nycopendata.socrata.com/api/geospatial/{_uid}?method=export&format=GeoJSON"

        options = config["dataset"]["source"]["options"]
        geometry = config["dataset"]["source"]["geometry"]
        config["dataset"]["source"] = {
            "url": {"path": url, "subpath": ""},
            "options": options,
            "geometry": geometry,
        }

    path = config["dataset"]["source"]["url"]["path"]
    subpath = config["dataset"]["source"]["url"]["subpath"]
    config["dataset"]["source"]["url"]["gdalpath"] = format_url(path, subpath)
    return config
var compute_json : str
Expand source code
@property
def compute_json(self) -> str:
    return json.dumps(self.compute, indent=4)
var compute_parsed : ()
Expand source code
@property
def compute_parsed(self) -> (dict, dict, dict, dict):
    config = self.compute
    dataset = config["dataset"]
    source = dataset["source"]
    destination = dataset["destination"]
    info = dataset["info"]
    return dataset, source, destination, info
var compute_yml : str
Expand source code
@property
def compute_yml(self) -> str:
    return yaml.dump(
        self.compute,
        Dumper=Dumper,
        default_flow_style=False,
        sort_keys=False,
        indent=2,
    )
var parsed_unrendered_template : dict

parsing unrendered template into a dictionary

Expand source code
@property
def parsed_unrendered_template(self) -> dict:
    """parsing unrendered template into a dictionary"""
    return yaml.safe_load(self.unparsed_unrendered_template)
var source_type : str

determine the type of the source, either url, socrata or script

Expand source code
@property
def source_type(self) -> str:
    """determine the type of the source, either url, socrata or script"""
    template = self.parsed_unrendered_template
    source = template["dataset"]["source"]
    return list(source.keys())[0]
var unparsed_unrendered_template : str

importing the yml file into a string

Expand source code
@property
def unparsed_unrendered_template(self) -> str:
    """importing the yml file into a string"""
    with open(self.path, "r") as f:
        return f.read()
var version_today : str

set today as the version name - for use with unspecified or invalid versions

Expand source code
@property
def version_today(self) -> str:
    """
    set today as the version name - for use with unspecified
    or invalid versions
    """
    return datetime.today().strftime("%Y%m%d")

Methods

def parsed_rendered_template(self, **kwargs) ‑> dict

render template, then parse into a dictionary

Expand source code
def parsed_rendered_template(self, **kwargs) -> dict:
    """render template, then parse into a dictionary"""
    template = Template(self.unparsed_unrendered_template)
    return yaml.safe_load(template.render(**kwargs))
def valid_version(self, version: str) ‑> bool

check that a version name is valid

Expand source code
def valid_version(self, version: str) -> bool:
    """check that a version name is valid"""
    return "{" not in version and "}" not in version
def version_socrata(self, uid: str) ‑> str

using the socrata API, collect the 'data last update' date

Expand source code
def version_socrata(self, uid: str) -> str:
    """using the socrata API, collect the 'data last update' date"""
    metadata = requests.get(
        f"https://data.cityofnewyork.us/api/views/{uid}.json"
    ).json()
    version = datetime.fromtimestamp(metadata["rowsUpdatedAt"]).strftime("%Y%m%d")
    return version
class Dumper (stream, default_style=None, default_flow_style=False, canonical=None, indent=None, width=None, allow_unicode=None, line_break=None, encoding=None, explicit_start=None, explicit_end=None, version=None, tags=None, sort_keys=True)
Expand source code
class Dumper(yaml.Dumper):
    def increase_indent(self, flow=False, indentless=False):
        return super(Dumper, self).increase_indent(flow, False)

Ancestors

  • yaml.dumper.Dumper
  • yaml.emitter.Emitter
  • yaml.serializer.Serializer
  • yaml.representer.Representer
  • yaml.representer.SafeRepresenter
  • yaml.representer.BaseRepresenter
  • yaml.resolver.Resolver
  • yaml.resolver.BaseResolver

Methods

def increase_indent(self, flow=False, indentless=False)
Expand source code
def increase_indent(self, flow=False, indentless=False):
    return super(Dumper, self).increase_indent(flow, False)