Module library.script.doe_pepmeetingurls
import urllib.parse
from itertools import groupby
from pathlib import Path

import dateutil.parser as dparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

from . import df_to_tempfile

def extract_date(x):
    """Fuzzy-parse a date from a string; return None if no date is found."""
    try:
        dt = dparser.parse(x, fuzzy=True)
        return dt.date()
    except (ValueError, OverflowError):
        return None

def get_date(url):
    """Return the first date found in the URL's path segments, in ISO format."""
    # Drop everything after the first "&" and any trailing "Public Notice" text.
    url = url.split("&")[0].split("Public Notice")[0]
    paths = Path(url).parts
    dates = [d for d in map(extract_date, paths) if d is not None]
    return dates[0].isoformat()

def get_school_year(url):
    """Return the school-year path segment of a decoded SharePoint URL.

    The school year is assumed to sit at index 11 of the URL's path parts,
    matching the layout of the DOE SharePoint links this module scrapes.
    """
    url = url.split("&")[0]  # drop everything after the first "&"
    return str(Path(url).parts[11])

class Scriptor:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def ingest(self) -> pd.DataFrame:
        url = (
            "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy"
        )
        url1 = "https://www.schools.nyc.gov/about-us/leadership/panel-for-education-policy/pep-meetings-archive"
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        soup1 = BeautifulSoup(requests.get(url1).content, "html.parser")
        proposals = []
        # Collect proposal URLs from the main PEP page.
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if "sharepoint" not in href:
                continue
            try:
                # Follow the link so redirects resolve to the full SharePoint URL.
                r = requests.get(href)
                readable_url = urllib.parse.unquote(r.url)
                # Contract documents are out of scope; keep only proposals.
                if "Contracts" not in readable_url:
                    proposals.append(
                        dict(
                            url=href,
                            school_year=get_school_year(readable_url),
                            readable_url=readable_url,
                            date=get_date(readable_url),
                        )
                    )
            except Exception:
                # Skip links that fail to resolve or parse.
                pass
        # Collect proposal URLs from the meetings-archive page.
        for a in soup1.find_all("a", href=True):
            href = a["href"]
            if "sharepoint" not in href:
                continue
            try:
                r = requests.get(href)
                readable_url = urllib.parse.unquote(r.url)
                if "Contracts" not in readable_url:
                    try:
                        proposals.append(
                            dict(
                                url=href,
                                school_year=get_school_year(readable_url),
                                readable_url=readable_url,
                                date=get_date(readable_url),
                            )
                        )
                    except Exception:
                        # Archive links sometimes lack a parseable date;
                        # keep the row with an empty date rather than drop it.
                        proposals.append(
                            dict(
                                url=href,
                                readable_url=readable_url,
                                school_year=get_school_year(readable_url),
                                date="",
                            )
                        )
            except Exception:
                pass

        # groupby only merges *consecutive* items, so sort by the dedup key first.
        def dedup_key(x):
            return x["date"] + x["school_year"]

        proposals.sort(key=dedup_key)
        # Within each (date, school_year) group, keep the shortest readable URL.
        proposals1 = [
            min(g, key=lambda x: len(x["readable_url"]))
            for _, g in groupby(proposals, key=dedup_key)
        ]
        return pd.DataFrame(proposals1)
    def runner(self) -> str:
        # Persist the scraped DataFrame to a temporary file and return its path.
        df = self.ingest()
        local_path = df_to_tempfile(df)
        return local_path
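A minimal usage sketch (Scriptor takes no required arguments; df_to_tempfile comes from the package, as imported above):

    from library.script.doe_pepmeetingurls import Scriptor

    local_path = Scriptor().runner()  # scrape both PEP pages, write a temp file
    print(local_path)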
Functions
def extract_date(x)
Fuzzy-parse a date out of an arbitrary string. Returns a datetime.date on success, or None if the string contains no recognizable date.
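A quick illustration of the fuzzy parsing (the input strings here are made up):

    extract_date("January 25, 2023 Meeting Materials")  # -> datetime.date(2023, 1, 25)
    extract_date("Meeting Materials")                   # -> None (no date present)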
def get_date(url)
Return the first date found among the path segments of a URL, formatted as an ISO string (YYYY-MM-DD). The URL is truncated at the first "&" and at any "Public Notice" suffix before parsing; raises IndexError if no segment contains a date.
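For example, given a decoded URL whose path contains a meeting-date folder (a made-up URL, shown for illustration):

    get_date("https://example.sharepoint.com/sites/PEP/Meetings/January 25, 2023/Proposal.pdf")
    # -> '2023-01-25'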
def get_school_year(url)
Return the school-year segment of a decoded SharePoint URL, taken to be the twelfth path component (index 11) after truncating at the first "&".
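Because the index is hard-coded, this only works when the decoded SharePoint URL keeps the school-year folder as its twelfth path component. A made-up URL with that shape:

    url = ("https://nycdoe.sharepoint.com/:b:/r/sites/PEPArchive/Shared Documents/"
           "Panel for Educational Policy/Meetings/Public Meetings/Documents/"
           "2022-2023 School Year/January 25, 2023/Proposal.pdf")
    get_school_year(url)  # -> '2022-2023 School Year'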
Classes
class Scriptor (**kwargs)
Scraper for NYC DOE Panel for Educational Policy (PEP) meeting proposal URLs. Any keyword arguments are stored as instance attributes.
Methods
def ingest(self) -> pandas.core.frame.DataFrame
Scrape the PEP leadership page and the PEP meetings archive, resolve each SharePoint link to its readable URL, and return a DataFrame of proposals deduplicated by date and school year, keeping the shortest readable URL in each group.
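A sketch of the expected output shape (column names come from the dicts built in ingest; row values depend on the live pages):

    df = Scriptor().ingest()
    sorted(df.columns)  # ['date', 'readable_url', 'school_year', 'url']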
def runner(self) -> str
Run ingest() and write the resulting DataFrame to a temporary file via df_to_tempfile, returning the local file path.