Module library.script.doe_pepmeetingurls

Expand source code
import json
import os
import re
import urllib
from itertools import groupby
from pathlib import Path

import dateutil.parser as dparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

from . import df_to_tempfile


def extract_date(x):
    """Return the date fuzzily parsed out of ``x``, or ``None`` on failure.

    ``x`` is handed to ``dateutil``'s parser with ``fuzzy=True`` and only the
    date part of the parsed datetime is returned.
    """
    try:
        return dparser.parse(x, fuzzy=True).date()
    except Exception:
        # Best-effort: callers filter out None. Unlike a bare ``except``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return None


def get_date(url):
    """Return the first date found among the path components of ``url``.

    Everything from the first "&" onward, and then everything from the first
    "Public Notice" onward, is discarded before the URL is split into path
    components. Each component is tried with ``extract_date``; the first one
    that yields a date is returned as an ISO-format string.

    Raises:
        IndexError: if no component yields a date.
    """
    trimmed = url.split("&")[0]
    trimmed = trimmed.split("Public Notice")[0]
    candidates = [extract_date(part) for part in Path(trimmed).parts]
    found = [d for d in candidates if d is not None]
    return found[0].isoformat()


def get_school_year(url):
    """Return the path component at index 11 of ``url`` as a string.

    Anything from the first "&" onward is dropped before the URL is split
    into path components. Presumably index 11 is the school-year folder of
    the resolved SharePoint URL -- verify against live URLs.

    Raises:
        IndexError: if the URL has fewer than 12 path components.
    """
    head, _, _ = url.partition("&")
    return str(Path(head).parts[11])


class Scriptor:
    """Collect SharePoint meeting links from the NYC schools PEP pages.

    Keyword arguments given to the constructor are stored verbatim as
    instance attributes; none of them are read by the methods defined here.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    @staticmethod
    def _collect_proposals(soup, allow_missing_date):
        """Return one record per usable SharePoint link found in ``soup``.

        Each link whose ``href`` contains "sharepoint" is requested, and the
        percent-decoded URL of the response is what the date and school year
        are parsed from. A link is skipped when the request fails, when the
        decoded URL contains "Contracts", or when no school year can be
        extracted. When no date can be extracted the link is skipped as
        well, unless ``allow_missing_date`` is true, in which case the
        record is kept with ``date`` set to "".

        Every record has the keys ``url``, ``school_year``, ``readable_url``
        and ``date``, always in that order.
        """
        records = []
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "sharepoint" not in href:
                continue
            try:
                readable_url = urllib.parse.unquote(requests.get(href).url)
                if "Contracts" in readable_url:
                    continue
                school_year = get_school_year(readable_url)
            except Exception:
                # Best-effort scrape: one bad link must not abort the run.
                continue
            try:
                date = get_date(readable_url)
            except Exception:
                if not allow_missing_date:
                    continue
                date = ""
            records.append(
                dict(
                    url=href,
                    school_year=school_year,
                    readable_url=readable_url,
                    date=date,
                )
            )
        return records

    @staticmethod
    def _dedupe(proposals):
        """Keep one record per (date, school_year): the shortest readable_url.

        ``itertools.groupby`` only merges *adjacent* equal keys, so it misses
        duplicates that are not next to each other in the scraped list. A
        dict keyed on the pair groups them regardless of position while
        preserving first-seen order; ties on length keep the earliest record.
        """
        best = {}
        for record in proposals:
            key = (record["date"], record["school_year"])
            kept = best.get(key)
            if kept is None or len(record["readable_url"]) < len(
                kept["readable_url"]
            ):
                best[key] = record
        return list(best.values())

    def ingest(self) -> pd.DataFrame:
        """Scrape both PEP pages and return the de-duplicated links.

        Links on the main panel page are kept only when both a date and a
        school year can be parsed from the resolved URL; links on the
        archive page are also kept without a date (``date == ""``).

        Returns:
            A DataFrame with one row per (date, school_year) pair and the
            columns ``url``, ``school_year``, ``readable_url``, ``date``.
        """
        landing_url = (
            "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy"
        )
        archive_url = "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy/pep-meetings-archive"
        landing_soup = BeautifulSoup(requests.get(landing_url).content, "html.parser")
        archive_soup = BeautifulSoup(requests.get(archive_url).content, "html.parser")

        proposals = self._collect_proposals(landing_soup, allow_missing_date=False)
        proposals += self._collect_proposals(archive_soup, allow_missing_date=True)
        return pd.DataFrame(self._dedupe(proposals))

    def runner(self) -> str:
        """Run ``ingest`` and return the result of ``df_to_tempfile`` on it."""
        df = self.ingest()
        local_path = df_to_tempfile(df)
        return local_path

Functions

def extract_date(x)
Expand source code
def extract_date(x):
    """Return the date fuzzily parsed out of ``x``, or ``None`` on failure.

    ``x`` is handed to ``dateutil``'s parser with ``fuzzy=True`` and only the
    date part of the parsed datetime is returned.
    """
    try:
        return dparser.parse(x, fuzzy=True).date()
    except Exception:
        # Best-effort: callers filter out None. Unlike a bare ``except``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return None
def get_date(url)
Expand source code
def get_date(url):
    """Return the first date found among the path components of ``url``.

    Everything from the first "&" onward, and then everything from the first
    "Public Notice" onward, is discarded before the URL is split into path
    components. Each component is tried with ``extract_date``; the first one
    that yields a date is returned as an ISO-format string.

    Raises:
        IndexError: if no component yields a date.
    """
    trimmed = url.split("&")[0]
    trimmed = trimmed.split("Public Notice")[0]
    candidates = [extract_date(part) for part in Path(trimmed).parts]
    found = [d for d in candidates if d is not None]
    return found[0].isoformat()
def get_school_year(url)
Expand source code
def get_school_year(url):
    """Return the path component at index 11 of ``url`` as a string.

    Anything from the first "&" onward is dropped before the URL is split
    into path components. Presumably index 11 is the school-year folder of
    the resolved SharePoint URL -- verify against live URLs.

    Raises:
        IndexError: if the URL has fewer than 12 path components.
    """
    head, _, _ = url.partition("&")
    return str(Path(head).parts[11])

Classes

class Scriptor (**kwargs)
Expand source code
class Scriptor:
    """Collect SharePoint meeting links from the NYC schools PEP pages.

    Keyword arguments given to the constructor are stored verbatim as
    instance attributes; none of them are read by the methods defined here.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    @staticmethod
    def _collect_proposals(soup, allow_missing_date):
        """Return one record per usable SharePoint link found in ``soup``.

        Each link whose ``href`` contains "sharepoint" is requested, and the
        percent-decoded URL of the response is what the date and school year
        are parsed from. A link is skipped when the request fails, when the
        decoded URL contains "Contracts", or when no school year can be
        extracted. When no date can be extracted the link is skipped as
        well, unless ``allow_missing_date`` is true, in which case the
        record is kept with ``date`` set to "".

        Every record has the keys ``url``, ``school_year``, ``readable_url``
        and ``date``, always in that order.
        """
        records = []
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "sharepoint" not in href:
                continue
            try:
                readable_url = urllib.parse.unquote(requests.get(href).url)
                if "Contracts" in readable_url:
                    continue
                school_year = get_school_year(readable_url)
            except Exception:
                # Best-effort scrape: one bad link must not abort the run.
                continue
            try:
                date = get_date(readable_url)
            except Exception:
                if not allow_missing_date:
                    continue
                date = ""
            records.append(
                dict(
                    url=href,
                    school_year=school_year,
                    readable_url=readable_url,
                    date=date,
                )
            )
        return records

    @staticmethod
    def _dedupe(proposals):
        """Keep one record per (date, school_year): the shortest readable_url.

        ``itertools.groupby`` only merges *adjacent* equal keys, so it misses
        duplicates that are not next to each other in the scraped list. A
        dict keyed on the pair groups them regardless of position while
        preserving first-seen order; ties on length keep the earliest record.
        """
        best = {}
        for record in proposals:
            key = (record["date"], record["school_year"])
            kept = best.get(key)
            if kept is None or len(record["readable_url"]) < len(
                kept["readable_url"]
            ):
                best[key] = record
        return list(best.values())

    def ingest(self) -> pd.DataFrame:
        """Scrape both PEP pages and return the de-duplicated links.

        Links on the main panel page are kept only when both a date and a
        school year can be parsed from the resolved URL; links on the
        archive page are also kept without a date (``date == ""``).

        Returns:
            A DataFrame with one row per (date, school_year) pair and the
            columns ``url``, ``school_year``, ``readable_url``, ``date``.
        """
        landing_url = (
            "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy"
        )
        archive_url = "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy/pep-meetings-archive"
        landing_soup = BeautifulSoup(requests.get(landing_url).content, "html.parser")
        archive_soup = BeautifulSoup(requests.get(archive_url).content, "html.parser")

        proposals = self._collect_proposals(landing_soup, allow_missing_date=False)
        proposals += self._collect_proposals(archive_soup, allow_missing_date=True)
        return pd.DataFrame(self._dedupe(proposals))

    def runner(self) -> str:
        """Run ``ingest`` and return the result of ``df_to_tempfile`` on it."""
        df = self.ingest()
        local_path = df_to_tempfile(df)
        return local_path

Methods

def ingest(self) ‑> pandas.core.frame.DataFrame
Expand source code
def ingest(self) -> pd.DataFrame:
    """Scrape both PEP pages and return the de-duplicated SharePoint links.

    Every link whose ``href`` contains "sharepoint" is requested, and the
    percent-decoded URL of the response is what the date and school year are
    parsed from. Links whose decoded URL contains "Contracts" are ignored.
    Links on the main panel page are kept only when both a date and a school
    year can be parsed; links on the archive page are also kept without a
    date (``date == ""``).

    Returns:
        A DataFrame with one row per (date, school_year) pair -- the record
        with the shortest ``readable_url`` -- and the columns ``url``,
        ``school_year``, ``readable_url``, ``date``.
    """
    landing_url = (
        "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy"
    )
    archive_url = "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy/pep-meetings-archive"

    def collect(soup, allow_missing_date):
        # Shared body of the two scraping passes; they differ only in
        # whether a link without a parseable date is kept.
        records = []
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "sharepoint" not in href:
                continue
            try:
                readable_url = urllib.parse.unquote(requests.get(href).url)
                if "Contracts" in readable_url:
                    continue
                school_year = get_school_year(readable_url)
            except Exception:
                # Best-effort scrape: one bad link must not abort the run.
                continue
            try:
                date = get_date(readable_url)
            except Exception:
                if not allow_missing_date:
                    continue
                date = ""
            records.append(
                dict(
                    url=href,
                    school_year=school_year,
                    readable_url=readable_url,
                    date=date,
                )
            )
        return records

    landing_soup = BeautifulSoup(requests.get(landing_url).content, "html.parser")
    archive_soup = BeautifulSoup(requests.get(archive_url).content, "html.parser")
    proposals = collect(landing_soup, False) + collect(archive_soup, True)

    # itertools.groupby only merges adjacent equal keys, so duplicates that
    # are not next to each other would survive. Group through a dict
    # instead: first-seen order is preserved and ties on length keep the
    # earliest record.
    best = {}
    for record in proposals:
        key = (record["date"], record["school_year"])
        kept = best.get(key)
        if kept is None or len(record["readable_url"]) < len(kept["readable_url"]):
            best[key] = record
    return pd.DataFrame(list(best.values()))
def runner(self) ‑> str
Expand source code
def runner(self) -> str:
    """Run ``ingest`` and return the result of ``df_to_tempfile`` on it."""
    return df_to_tempfile(self.ingest())