Data Cleaning

The purpose of this notebook is to clean the data generated in notebook “0. Data Collection”. I will clean each source and generate multiple dataframes to later model language.

1. Environment Creation

1.1 Library Import

''' OS MANAGEMENT '''
import os

''' DATA MANAGEMENT '''
import pandas as pd
import regex as re

''' DATA STRUCTURING '''
import ast 

''' TEXT PROCESSING '''
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

from nltk.stem import WordNetLemmatizer 
lem = WordNetLemmatizer() 



''' DATA VISUALIZATION '''
import seaborn as sb
from wordcloud import WordCloud
import matplotlib.pyplot as plt

1.2 Data Import

# --- News articles -----------------------------------------------------------
# Load both raw NewsAPI exports and discard the 'Unnamed: 0' column each CSV
# carries (presumably the index written by an earlier to_csv call -- confirm).
republican_news = pd.read_csv(
    "NEWSAPI - republican climate articles raw.csv"
).drop(columns='Unnamed: 0')
democrat_news = pd.read_csv(
    "NEWSAPI - democrat climate articles raw.csv"
).drop(columns='Unnamed: 0')

# --- EPA bills ---------------------------------------------------------------
bills = pd.read_csv(r"Bill Information Full FINAL.csv").drop(columns='Unnamed: 0')

# --- Party platforms ---------------------------------------------------------
# Each platform is read as one plain-text string.
with open("democrat_party_platform.txt", "r") as platform_file:
    democrat_pdf = platform_file.read()

with open("republican_party_platform.txt", 'r') as platform_file:
    republican_pdf = platform_file.read()

1.3 Function Definition

''' 🫧🧼 | now lets create a cleaning function '''

def text_cleaner(text):
    """Normalize a piece of text for vectorization.

    Every digit is replaced by a space, the remaining runs of word
    characters (``\\w+``) are extracted and re-joined with single spaces,
    and the result is lower-cased.  Punctuation and repeated whitespace
    therefore disappear.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.  Anything that is not a ``str`` (for example a
        NaN read from a CSV) is returned unchanged, exactly as before, so
        the caller can still drop it afterwards.
    """
    # Only strings can be scrubbed.  Checking the type explicitly replaces
    # the former bare ``except:``, which also hid unrelated errors.
    if not isinstance(text, str):
        return text

    # Raw strings: '\d' and '\w' in plain literals are invalid escapes.
    without_digits = re.sub(r'\d', ' ', text)
    tokens = re.findall(r'\w+', without_digits)
    return ' '.join(tokens).lower().strip(" ")
''' Code Source: Gates Bolton Analytics'''

def stemmer(string):
    """Tokenize ``string`` and stem every token with the module-level ``ps``.

    Every character that is not an ASCII letter or a hyphen is replaced by
    a space; the text is then lower-cased, split on whitespace and each
    token is passed to ``ps.stem``.

    Parameters
    ----------
    string : str
        Text to stem.

    Returns
    -------
    list of str
        The stemmed tokens.  For non-string input (e.g. NaN) the empty
        string ``""`` is returned, as before; joining it with ``' '``
        gives the same empty document as an empty list would.
    """
    # Explicit type guard instead of a bare ``except:`` so that genuine
    # failures inside the stemmer are no longer silently turned into "".
    if not isinstance(string, str):
        return ("")

    tokens = re.sub(r"[^A-Za-z\-]", " ", string).lower().split()
    return [ps.stem(token) for token in tokens]

def lemmer(string):
    """Tokenize ``string`` and lemmatize every token with the module-level ``lem``.

    Every character that is not an ASCII letter or a hyphen is replaced by
    a space; the text is then lower-cased, split on whitespace and each
    token is passed to ``lem.lemmatize``.

    Parameters
    ----------
    string : str
        Text to lemmatize.

    Returns
    -------
    list of str
        The lemmatized tokens.  For non-string input (e.g. NaN) the empty
        string ``""`` is returned, as before; joining it with ``' '``
        gives the same empty document as an empty list would.
    """
    # Explicit type guard instead of a bare ``except:``.  Errors raised by
    # the lemmatizer itself now propagate instead of silently producing an
    # empty document for every input.
    if not isinstance(string, str):
        return ("")

    tokens = re.sub(r"[^A-Za-z\-]", " ", string).lower().split()
    return [lem.lemmatize(token) for token in tokens]
def count_vectorizer_creation(max_features,content,labels,label_colname,min_df=2,max_df=700):
    """Build a labelled document-term count matrix with ``CountVectorizer``.

    Parameters
    ----------
    max_features : int
        Forwarded to ``CountVectorizer`` as ``max_features``.
    content : iterable of str
        One text per document.
    labels : list, pandas.DataFrame or pandas.Series
        Label(s) of every document, in the same order as ``content``.
    label_colname : str
        Name of the label column when ``labels`` is a list.  It is not used
        when ``labels`` is a pandas object, which keeps its own column names.
    min_df, max_df : int or float, optional
        Forwarded to ``CountVectorizer``.  The defaults (2 and 700) are the
        values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The label column(s) first, followed by one count column per
        vocabulary term; one row per document.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of documents.
    """
    # English stop words are removed; the texts are passed in directly
    # (input='content') rather than as file names.
    count_vec = CountVectorizer(input='content', stop_words='english',
                                max_features=max_features, min_df=min_df, max_df=max_df)

    # Learn the vocabulary and count the terms of every document
    model = count_vec.fit_transform(content)

    # One column per vocabulary term
    columns = count_vec.get_feature_names_out()
    vect_dataframe = pd.DataFrame(model.toarray(),columns=columns)

    if len(labels) != len(vect_dataframe):
        raise ValueError(
            f"Got {len(labels)} labels for {len(vect_dataframe)} documents."
        )

    if isinstance(labels, list):
        vect_dataframe.insert(0,label_colname,labels)
    else:
        # pd.concat(axis=1) aligns on the index.  Labels sliced from a frame
        # whose index has gaps (e.g. after dropna) would otherwise be paired
        # with the wrong rows and leave NaN rows behind, so pair by position.
        positional_labels = labels.reset_index(drop=True)
        vect_dataframe = pd.concat([positional_labels,vect_dataframe],axis=1)

    return (vect_dataframe)
def tfidf_vectorizer_creation(max_features,content,labels,label_colname,min_df=2,max_df=700):
    """Build a labelled TF-IDF matrix with ``TfidfVectorizer``.

    Parameters
    ----------
    max_features : int
        Forwarded to ``TfidfVectorizer`` as ``max_features``.
    content : iterable of str
        One text per document.
    labels : list, pandas.DataFrame or pandas.Series
        Label(s) of every document, in the same order as ``content``.
    label_colname : str
        Name of the label column when ``labels`` is a list.  It is not used
        when ``labels`` is a pandas object, which keeps its own column names.
    min_df, max_df : int or float, optional
        Forwarded to ``TfidfVectorizer``.  The defaults (2 and 700) are the
        values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The label column(s) first, followed by one TF-IDF column per
        vocabulary term; one row per document.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of documents.
    """
    # English stop words are removed; the texts are passed in directly
    # (input='content') rather than as file names.
    tfidf_vec = TfidfVectorizer(input='content', stop_words='english',
                                max_features=max_features, min_df=min_df, max_df=max_df)

    # Learn the vocabulary and weight the terms of every document
    tfidf_model = tfidf_vec.fit_transform(content)

    # One column per vocabulary term
    tfidf_columns = tfidf_vec.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_model.toarray(),columns=tfidf_columns)

    if len(labels) != len(tfidf_df):
        raise ValueError(
            f"Got {len(labels)} labels for {len(tfidf_df)} documents."
        )

    if isinstance(labels, list):
        tfidf_df.insert(0,label_colname,labels)
    else:
        # pd.concat(axis=1) aligns on the index.  Labels sliced from a frame
        # whose index has gaps (e.g. after dropna) would otherwise be paired
        # with the wrong rows and leave NaN rows behind, so pair by position.
        positional_labels = labels.reset_index(drop=True)
        tfidf_df = pd.concat([positional_labels,tfidf_df],axis=1)

    return (tfidf_df)

2. News Article Cleaning

republican_news.head(1)
source author title description url urlToImage publishedAt content
0 {'id': 'the-verge', 'name': 'The Verge'} Nilay Patel Trump’s first 100 days: all the news impacting... President Donald Trump is taking on TikTok, el... https://www.theverge.com/24348851/donald-trump... https://cdn.vox-cdn.com/thumbor/Nwo4_i4giY8lRM... 2025-01-22T14:30:00Z Filed under:\r\nByLauren Feiner, a senior poli...
# One party tag per article in each source frame
republican_label = ['Republican'] * len(republican_news)
democrat_label = ['Democrat'] * len(democrat_news)

republican_news['Party'] = republican_label
democrat_news['Party'] = democrat_label

# Stack both frames, renumber the rows, and keep only the columns that the
# rest of the notebook works with.
news_data = pd.concat([republican_news, democrat_news]).reset_index()[
    ['Party', 'source', 'title', 'description']
]
## Now fixing the source
def source_fixer(source):
    """Return the publisher name stored in a stringified source entry.

    ``source`` is the text form of a dict literal such as
    ``"{'id': 'the-verge', 'name': 'The Verge'}"``.  It is parsed with
    ``ast.literal_eval`` and the value stored under ``'name'`` is returned.
    """
    return ast.literal_eval(source)['name']
# Pull the readable publisher name out of every raw 'source' entry
publisher = [source_fixer(raw_source) for raw_source in news_data['source']]

## Store the publisher name in its own column and retire the raw source text
news_data['publisher'] = publisher
news_data.drop(columns='source',inplace=True)
# --- Titles: scrub every raw headline with the shared cleaner ----------------
clean_titles = [text_cleaner(raw_title) for raw_title in news_data['title']]

## Keep the cleaned title next to the raw one
news_data['clean title'] = clean_titles

# --- Descriptions: same treatment --------------------------------------------
clean_descriptions = [
    text_cleaner(raw_description) for raw_description in news_data['description']
]

## Keep the cleaned description next to the raw one
news_data['clean description'] = clean_descriptions

# --- Combined text: title and description joined for larger context ----------
news_data['title + description'] = news_data['clean title'] + " " + news_data['clean description']
news_data.head(3)
Party title description publisher clean title clean description title + description
0 Republican Trump’s first 100 days: all the news impacting... President Donald Trump is taking on TikTok, el... The Verge trump s first days all the news impacting the ... president donald trump is taking on tiktok ele... trump s first days all the news impacting the ...
1 Republican The Quiet Death of Biden’s Climate Corps—and W... Biden's green jobs program was never what it s... Gizmodo.com the quiet death of biden s climate corps and w... biden s green jobs program was never what it s... the quiet death of biden s climate corps and w...
2 Republican The peanut farmer who rose to US president and... The US president struggled in the White House ... BBC News the peanut farmer who rose to us president and... the us president struggled in the white house ... the peanut farmer who rose to us president and...
# Drop every article that has a missing value in any column
news_data.dropna(inplace=True)
# NOTE(review): reset_index() without drop=True keeps the previous index as a
# new 'index' column, which is then also written to the CSV below -- confirm
# whether anything downstream relies on it before switching to drop=True.
news_data.reset_index(inplace=True)
news_data.to_csv("News Data Cleaned.csv")
# Spot-check one article (row 169): raw title and description first ...
print (news_data.at[169,'title'])
print (news_data.at[169,'description'])

# ... then the cleaned, combined text built from them
print ("\nCleaned and Combined Title and Description----------------")
print (news_data.at[169,'title + description'])
The Trump-Newsom Fight Over an Alleged 'Water Restoration Declaration,' Explained
Trump claimed Newsom's refusal to sign the document led to a water shortage during the Los Angeles fires. But there's more to the story.

Cleaned and Combined Title and Description----------------
the trump newsom fight over an alleged water restoration declaration explained trump claimed newsom s refusal to sign the document led to a water shortage during the los angeles fires but there s more to the story
# Counter is not part of the library-import cell, so import it here;
# without this the cell fails with a NameError on a fresh kernel.
from collections import Counter

# Tally how many articles each publisher contributed
publisher_counts = Counter(news_data['publisher'].to_list())
count = pd.DataFrame.from_dict(publisher_counts,orient='index',columns=['Count'])
print (f"There are {len(count)} unique sources in the dataset.")
There are 166 unique sources in the dataset.
labels = news_data[['Party','publisher']]

2.2.1 Stemming

# Stem every combined "title + description" text and glue the stems back
# together into one space-separated document per article.
stemmed_texts = [
    ' '.join(stemmer(combined_text))
    for combined_text in news_data['title + description']
]
stemmed_texts[0]
'trump s first day all the news impact the tech industri presid donald trump is take on tiktok electr vehicl polici and ai in hi first day in offic thi time around he ha the back of mani tech billionair'
news_vec_stemmed = count_vectorizer_creation(100000,stemmed_texts,labels,['Party','publisher'])
news_vec_stemmed
Party publisher aapi abandon abc abil abl abolish abort abov ... yekel york young youth zealot zeldin zer zero zone zuckerberg
0 Republican The Verge 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Republican Gizmodo.com 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
817 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
818 Democrat The Times of India 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
819 Democrat The Times of India NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
699 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

820 rows × 2122 columns

news_vec_stemmed.to_csv("News Articles Stemmed- Count Vectorizer.csv")
news_tfidf_stemmed = tfidf_vectorizer_creation(100000,stemmed_texts,labels,['Party','publisher'])
news_tfidf_stemmed
Party publisher aapi abandon abc abil abl abolish abort abov ... yekel york young youth zealot zeldin zer zero zone zuckerberg
0 Republican The Verge 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
1 Republican Gizmodo.com 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
2 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
3 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
4 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
817 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.244297 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0
818 Democrat The Times of India 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.20093 0.0
819 Democrat The Times of India NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
699 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0

820 rows × 2122 columns

news_tfidf_stemmed.to_csv("News Articles Stemmed- TFIDF.csv")

2.2.2 Lemmatization

# Lemmatize every combined "title + description" text and glue the lemmas
# back together into one space-separated document per article.
lemmed_texts = [
    ' '.join(lemmer(combined_text))
    for combined_text in news_data['title + description']
]
news_vec_lemmed = count_vectorizer_creation(10000,lemmed_texts,labels,['Party','publisher'])
news_vec_lemmed
Party publisher aapi abandon abandoned abc ability able abolish abortion ... yes york young youth zealot zeldin zero zers zone zuckerberg
0 Republican The Verge 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Republican Gizmodo.com 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
817 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
818 Democrat The Times of India 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
819 Democrat The Times of India NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
699 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

820 rows × 2360 columns

news_vec_lemmed.to_csv("News Articles Lemmed- Count Vectorizer.csv")
news_tfidf_lemmed = tfidf_vectorizer_creation(10000,lemmed_texts,labels,['Party','publisher'])
news_tfidf_lemmed
Party publisher aapi abandon abandoned abc ability able abolish abortion ... yes york young youth zealot zeldin zero zers zone zuckerberg
0 Republican The Verge 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
1 Republican Gizmodo.com 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
2 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
3 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
4 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
816 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
817 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
818 Democrat The Times of India 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.201006 0.0
819 Democrat The Times of India NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
699 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0

820 rows × 2360 columns

news_tfidf_lemmed.to_csv("News Articles Lemmed- TFIDF.csv")

2.2.3 CountVectorizer

texts = news_data['title + description'].to_list()
news_vec = count_vectorizer_creation(10000,texts,labels,['Party','publisher'])
news_vec.to_csv("News Articles - Count Vectorizer.csv")

2.2.4 TF-IDF Vectorizer

news_tfidf = tfidf_vectorizer_creation(10000,texts,labels,['Party','publisher'])
news_tfidf.to_csv("News Articles - TF-IDF.csv")

3. EPA Bill Cleaning

bills.head(1)
API URL Congress Number Bill Type Bill Number Legislation Number URL_y Congress Title Sponsor Date of Introduction ... Number of Cosponsors Amends Bill Date Offered Date Submitted Date Proposed Amends Amendment Sponser Affiliation Sponser State Bill Title (XML) Bill Text
0 https://www.congress.gov/119/bills/hr375/BILLS... 119 hr 375 H.R. 375 https://www.congress.gov/bill/119th-congress/h... 119th Congress (2025-2026) Continued Rapid Ohia Death Response Act of 2025 Tokuda, Jill N. [Rep.-D-HI-2] (Introduced 01/1... 1/13/2025 ... 1 NaN NaN NaN NaN NaN D HI <dc:title>119 HR 375 : Continued Rapid Ohia De... \n\n119 HR 375 : Continued Rapid Ohia Death Re...

1 rows × 23 columns

# --- Bills: scrub every raw bill text with the shared cleaner ----------------
clean_bills = [text_cleaner(raw_bill_text) for raw_bill_text in bills['Bill Text']]

## Keep the cleaned text next to the raw one
bills['Bill Text Clean'] = clean_bills

# Inputs for the vectorizers: the cleaned texts and the label columns
texts = clean_bills
labels = bills[['Bill Type','Sponser Affiliation','Sponser State','Committees']]

3.2.1 Stemming

# Stem every cleaned bill text and glue the stems back together into one
# space-separated document per bill.
stemmed_texts = [
    ' '.join(stemmer(clean_bill_text))
    for clean_bill_text in bills['Bill Text Clean']
]
stemmed_texts[0]
'hr continu rapid ohia death respons act of u s hous of repres text xml en pursuant to titl section of the unit state code thi file is not subject to copyright protect and is in the public domain iib th congress st sessionh r in the senat of the unit statesjanuari receiv read twice and refer to the committe on agricultur nutrit and forestryan actto requir the secretari of the interior to partner and collabor with the secretari of agricultur and the state of hawaii to address rapid ohia death and for other purpos short titlethi act may be cite as the continu rapid ohia death respons act of definitionsin thi act rapid ohia deathth term rapid ohia death mean the diseas caus by the fungal pathogen known as ceratocysti fimbriata that affect the tree of the speci metrosidero polymorpha stateth term state mean the state of hawaii collaborationth secretari of the interior shall partner and collabor with the secretari of agricultur and the state to address rapid ohia death sustain effort a transmissionth secretari of the interior act through the director of the unit state geolog survey and the chief of the forest servic act through the forest servic institut of pacif island forestri shall continu to conduct research on rapid ohia death vector and transmiss b ungul managementth secretari of the interior act through the director of the unit state fish and wildlif servic shall continu to partner with the secretari of agricultur the state and with local stakehold to manag ungul in rapid ohia death control area on feder state and privat land with the consent of privat landown c restor and researchth secretari of agricultur act through the chief of the forest servic shall continu to provid financi assist includ through agreement with the secretari of the interior a to prevent the spread of rapid ohia death and b to restor the nativ forest of the state and staff and necessari infrastructur fund to the institut of pacif island forestri to conduct research on rapid ohia death pass 
the hous of repres januari kevin f mccumber clerk'
bill_vec_stemmed = count_vectorizer_creation(100000,stemmed_texts,labels,['Bill Type','Sponser Affiliation','Sponser State','Committees'])
bill_vec_stemmed
Bill Type Sponser Affiliation Sponser State Committees aa aaa aarhu ab abandon abandonth ... zoe zone zonea zonesnotwithstand zoneth zoo zoolog zoonot zooplankton zquez
0 hr D HI House - Natural Resources, Agriculture | Senat... 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 hr R NY House - Agriculture 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 hr R TX House - Energy and Commerce 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 hr R NY House - Transportation and Infrastructure, Nat... 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 hr R OH House - Transportation and Infrastructure 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3256 hr D CA House - Transportation and Infrastructure 0 0 0 0 0 0 ... 0 6 0 0 0 0 0 0 0 0
3257 hr R CO House - Resources 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3258 hr D MI House - Energy and Commerce 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3259 s D NJ Senate - Environment and Public Works 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3260 hr R TX House - Energy and Commerce 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3261 rows × 15493 columns

bill_vec_stemmed.to_csv("Bills Stemmed- Count Vectorizer.csv")
bill_tfidf_stemmed = tfidf_vectorizer_creation(100000,stemmed_texts,labels,['Bill Type','Sponser Affiliation','Sponser State','Committees'])
bill_tfidf_stemmed
Bill Type Sponser Affiliation Sponser State Committees aa aaa aarhu ab abandon abandonth ... zoe zone zonea zonesnotwithstand zoneth zoo zoolog zoonot zooplankton zquez
0 hr D HI House - Natural Resources, Agriculture | Senat... 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 hr R NY House - Agriculture 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 hr R TX House - Energy and Commerce 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 hr R NY House - Transportation and Infrastructure, Nat... 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 hr R OH House - Transportation and Infrastructure 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3256 hr D CA House - Transportation and Infrastructure 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.022374 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3257 hr R CO House - Resources 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3258 hr D MI House - Energy and Commerce 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3259 s D NJ Senate - Environment and Public Works 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3260 hr R TX House - Energy and Commerce 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3261 rows × 15493 columns

bill_tfidf_stemmed.to_csv("Bills Stemmed- TFIDF.csv")

3.2.2 Lemmatization

# Lemmatize every cleaned bill text and glue the lemmas back together into
# one space-separated document per bill.
lemmed_texts = [
    ' '.join(lemmer(clean_bill_text))
    for clean_bill_text in bills['Bill Text Clean']
]

# Fixed: this matrix was built from `stemmed_texts`, so the "Lemmed" CSV was
# a duplicate of the stemmed one.  It now uses the lemmatized texts.
bill_vec_lemmed = count_vectorizer_creation(100000,lemmed_texts,labels,['Bill Type','Sponser Affiliation','Sponser State','Committees'])
bill_vec_lemmed.to_csv("Bills Lemmed- Count Vectorizer.csv")

# Fixed: the label names and the 10000 feature cap had been copied from the
# news section; they now match the other bill vectorizer calls.
bill_tfidf_lemmed = tfidf_vectorizer_creation(100000,lemmed_texts,labels,['Bill Type','Sponser Affiliation','Sponser State','Committees'])
bill_tfidf_lemmed.to_csv("Bills Lemmed- TFIDF.csv")

3.2.3 CountVectorizer

bill_vec = count_vectorizer_creation(100000,texts,labels,['Bill Type','Sponser Affiliation','Sponser State','Committees'])
bill_vec.to_csv("Bills - Count Vectorizer.csv")

3.2.4 TF-IDF Vectorizer

bill_tfidf = tfidf_vectorizer_creation(100000,texts,labels,['Bill Type','Sponser Affiliation','Sponser State','Committees'])
bill_tfidf.to_csv("Bills - TF-IDF.csv")

3.3 Visualizing Before and After Bill Cleaning

# All raw bill texts as a plain Python list
raw_example = bills['Bill Text'].to_list()
# One single bill (position 200) kept for a before/after look
raw_example_text = raw_example[200]
# Whole corpus as one string, raw and cleaned, for the word clouds.
# NOTE(review): ' '.join raises TypeError if any entry is not a string
# (e.g. a missing bill text read as NaN) -- confirm there are none.
raw_example_string = ' '.join(raw_example)
clean_example_string = ' '.join(texts)
raw_example_text
'\n\n\n HR 2950 ENR: Coastal Habitat Conservation Act of 2023\nU.S. House of Representatives\n\ntext/xml\nEN\nPursuant to Title 17 Section 105 of the United States Code, this file is not subject to copyright protection and is in the public domain.\n\n\n\nIB\nOne Hundred Eighteenth Congress of the United States of AmericaAt the Second SessionBegun and held at the City of Washington on Wednesday, the third day of January, two thousand and twenty-four\nH. R. 2950\n\nAN ACT\nTo authorize the Secretary of the Interior, through the Coastal Program of the United States Fish and Wildlife Service, to work with willing partners and provide support to efforts to assess, protect, restore, and enhance important coastal landscapes that provide fish and wildlife habitat on which certain Federal trust species depend, and for other purposes.\n\n\n1.Short titleThis Act may be cited as the Coastal Habitat Conservation Act of 2023. 2.PurposeThe purpose of this Act is to legislatively authorize the Coastal Program of the Service in effect as of the date of the enactment of this Act to conduct collaborative landscape-level planning and on-the-ground coastal habitat assessment, coastal habitat protection, coastal habitat restoration, and coastal habitat enhancement projects in priority coastal landscapes to conserve and recover Federal trust species.\n3.DefinitionsIn this Act: (1)Coastal ecosystemThe term coastal ecosystem means a biological community of organisms interacting with each other and their habitats in a coastal landscape.\n(2)Coastal habitat assessmentThe term coastal habitat assessment means the process of evaluating the physical, chemical, and biological function of a coastal site to determine the value of the site to fish and wildlife. 
(3)Coastal habitat enhancementThe term coastal habitat enhancement means the manipulation of the physical, chemical, or biological characteristics of a coastal ecosystem to increase or decrease specific biological functions that make the ecosystem valuable to fish and wildlife.\n(4)Coastal habitat planningThe term coastal habitat planning means the process of developing a comprehensive plan that— (A)characterizes a coastal ecosystem;\n(B)sets protection, restoration, or enhancement goals and identifies the priorities of those goals; (C)describes conservation strategies and methodologies;\n(D)establishes a timetable for implementation of the plan; and (E)identifies roles of participants and stakeholders.\n(5)Coastal habitat protection\n(A)In generalThe term coastal habitat protection means a long-term action to safeguard habitat of value to fish and wildlife in a coastal ecosystem. (B)InclusionThe term coastal habitat protection includes activities to support establishment of a conservation easement or fee title acquisition by Federal and non-Federal partners.\n(6)Coastal habitat restorationThe term coastal habitat restoration means the manipulation of the physical, chemical, or biological characteristics of a coastal ecosystem with the goal of returning, to the maximum extent practicable, the full natural biological functions to lost or degraded native habitat. (7)Coastal landscapeThe term coastal landscape means a portion of a coastal ecosystem within or adjacent to a coastal State that contains various habitat types, including—\n(A)a fresh or saltwater wetland in a coastal watershed; (B)a coastal river, stream, or waterway;\n(C)a coastal bay or estuary; (D)a seagrass bed, reef, or other nearshore marine habitat;\n(E)a beach or dune system; (F)a mangrove forest; and\n(G)an associated coastal upland.  
(8)Coastal StateThe term coastal State means—\n(A)a State in, or bordering on, the Atlantic, Pacific, or Arctic Ocean, the Gulf of Mexico, the Long Island Sound, or 1 or more of the Great Lakes; (B)the District of Columbia;\n(C)the Commonwealth of Puerto Rico; (D)Guam;\n(E)American Samoa; (F)the Commonwealth of the Northern Mariana Islands;\n(G)the Federated States of Micronesia; (H)the Republic of the Marshall Islands;\n(I)the Republic of Palau; and (J)the United States Virgin Islands.\n(9)Federal trust speciesThe term Federal trust species means migratory birds, threatened species or endangered species listed under the Endangered Species Act of 1973 (16 U.S.C. 1531 et seq.), interjurisdictional fish, and marine mammals for which the Secretary has management authority. (10)Financial assistanceThe term financial assistance means Federal funding provided to Federal, State, local, or Tribal governments, nongovernmental institutions, nonprofit organizations, and private individuals and entities through a grant or cooperative agreement.\n(11)SecretaryThe term Secretary means the Secretary of the Interior. (12)ServiceThe term Service means the United States Fish and Wildlife Service.\n(13)Technical assistanceThe term technical assistance means a collaboration, facilitation, or consulting action relating to a coastal habitat planning, coastal habitat assessment, coastal habitat protection, coastal habitat restoration, or coastal habitat enhancement project or initiative in which the Service contributes scientific knowledge, skills, and expertise to the project or initiative. 
4.Coastal programThe Secretary shall carry out the Coastal Program within the Service to—\n(1)identify the leading threats to priority coastal landscapes and conservation actions to address those threats in partnership with Federal, State, local, and Tribal governments, nongovernmental institutions, nonprofit organizations, and private individuals and entities;  (2)provide technical assistance and financial assistance through partnerships with Federal, State, local, and Tribal governments, nongovernmental institutions, nonprofit organizations, and private individuals and entities to conduct voluntary coastal habitat planning, coastal habitat assessment, coastal habitat protection, coastal habitat restoration, and coastal habitat enhancement projects on public land or private land;\n(3)ensure the health and resilience of coastal ecosystems through adaptive management procedures based on the best available science; (4)build the capacity of Federal, State, local, and Tribal governments, nongovernmental institutions, nonprofit organizations, and private individuals and entities to carry out environmental conservation and stewardship measures;\n(5)assist in the development and implementation of monitoring protocols to ensure the success of coastal ecosystem restoration and coastal ecosystem enhancement measures; and (6)collaborate and share information with partners and the public relating to best management practices for the conservation, restoration, and enhancement of coastal ecosystems.\n5.Reports\n(a)In generalNot later than 1 year after the date of the enactment of this Act, and annually thereafter, the Secretary, acting through the Director of the Service, shall submit to the Committees on Appropriations and Natural Resources of the House of Representatives and the Committees on Appropriations and Environment and Public Works of the Senate, and make available to the public on the website of the Service, a report on the Coastal Program carried out under this Act. 
(b)RequirementsEach report submitted under subsection (a) shall assess on regional and nationwide bases—\n(1)Coastal Program work on coastal ecosystems; (2)progress made by the Coastal Program toward identifying the leading threats to priority coastal landscapes and conservation actions to address those threats; and\n(3)prospects for, and success of, protecting, restoring, and enhancing coastal ecosystems. (c)InclusionsEach report submitted under subsection (a) shall include—\n(1)quantitative information on coastal landscapes protected, restored, or enhanced; (2)funds appropriated to the Coastal Program that have been expended or leveraged;\n(3)a description of adaptive management practices implemented; and (4)a description of emerging challenges or data gaps that hinder the ability of the Coastal Program to achieve the purpose of this Act.\n6.Authorization of appropriationsThere is authorized to be appropriated to carry out this Act $16,957,000 for each of fiscal years 2024 through 2028.  Speaker of the House of Representatives.Vice President of the United States and President of the Senate. '
title = bills.at[200,'Title']
clean_example_text = texts[200]
# NOTE(review): sb.set(...) runs after sb.set_style("white"); unless a style
# is passed, sb.set presumably re-applies seaborn's default style and undoes
# the line above -- confirm against the seaborn documentation.
sb.set_style("white")
sb.set(font='Times New Roman', font_scale=1.2)

# Word cloud built from the joined raw text of all bills (raw_example_string)
wc = WordCloud(width=600,height=300, background_color='white',colormap='gist_earth_r',max_words=1000)
wc.generate_from_text(raw_example_string)

## Plotting the cloud

# Draw the cloud without axes, title it, and save it to disk
plt.figure(figsize=(9,6),dpi=750)
plt.imshow(wc)
plt.axis('off')
plt.title("Raw Bill Text")
plt.savefig("Bill - Raw Text.png",dpi=1000);

png

# NOTE(review): sb.set(...) runs after sb.set_style("white"); unless a style
# is passed, sb.set presumably re-applies seaborn's default style and undoes
# the line above -- confirm against the seaborn documentation.
sb.set_style("white")
sb.set(font='Times New Roman', font_scale=1.2)

# Word cloud built from the joined cleaned text of all bills (clean_example_string)
wc = WordCloud(width=600,height=300, background_color='white',colormap='gist_earth_r',max_words=1000)
wc.generate_from_text(clean_example_string)

## Plotting the cloud

# Draw the cloud without axes, title it, and save it to disk
plt.figure(figsize=(9,6),dpi=750)
plt.imshow(wc)
plt.axis('off')
plt.title("Clean Bill Text")
plt.savefig("Bill - Clean Text.png",dpi=1000);

png


4. Party Platform Cleaning

democrat_pdf[0:100]
' Democratic\nNational\nConvention \nLand\nAcknowledgement\nThe\nDemocratic\nNational\nCommittee\nwishes\nto\nac'
republican_pdf[0:100]
'4343RDRD REPUBLICAN NATIONAL CONVENTION REPUBLICAN NATIONAL CONVENTION\nPLATFORMTHE 2024 REPUBLICAN\nM'

4.1 Visualizing Raw Data

# Seaborn theme / font settings for the figure
sb.set_style("white")
sb.set(font='Times New Roman', font_scale=1.2)

# Word cloud of the raw Democratic platform text (blue colormap, capped at 250 words)
wc = WordCloud(
    width=600,
    height=300,
    background_color='white',
    colormap='Blues',
    max_words=250,
)
wc.generate_from_text(democrat_pdf)

# Draw the cloud without axes, title it, and write the image to disk
plt.figure(figsize=(9, 6), dpi=750)
plt.imshow(wc)
plt.axis('off')
plt.title("DNC Party Platform - 2024\nRaw PDF Text")
plt.savefig("DNC Party Platform - Raw Text", dpi=1000);

png

# Seaborn theme / font settings for the figure
sb.set_style("white")
sb.set(font='Times New Roman', font_scale=1.2)

# Word cloud of the raw Republican platform text (orange-red colormap, capped at 250 words)
wc = WordCloud(
    width=600,
    height=300,
    background_color='white',
    colormap='OrRd',
    max_words=250,
)
wc.generate_from_text(republican_pdf)

# Draw the cloud without axes, title it, and write the image to disk
plt.figure(figsize=(9, 6), dpi=750)
plt.imshow(wc)
plt.axis('off')
plt.title("GOP Party Platform - 2024\nRaw PDF Text")
plt.savefig("GOP Party Platform - Raw Text", dpi=1000);

png

4.2 DF Creation

''' CLEANING THE TEXT -- DEMOCRAT '''

## Replace every non-word character (punctuation, newlines, other whitespace) with a space.
## The pattern is a raw string so that `\W` is not parsed as an (invalid) string escape.
democrat_text = re.sub(r'\W', ' ', democrat_pdf)

## Trim leading and trailing spaces
democrat_string = democrat_text.strip(" ")

## Split on single spaces; consecutive spaces leave empty strings in the list,
## which the filtering step that follows removes
democrat_text_list = democrat_string.split(" ")
democrat_text_list[0:10]
['Democratic',
 'National',
 'Convention',
 '',
 'Land',
 'Acknowledgement',
 'The',
 'Democratic',
 'National',
 'Committee']
## The split leaves a few empty strings behind, so keep only tokens that are longer than
## one character and contain no digits, lowercasing each kept token.
## (The length check also drops genuine one-letter words.)
dem_text_clean = [
    word.lower()
    for word in democrat_text_list
    if len(word) > 1 and not re.search(r"\d", word)
]
dem_text_clean[0:10]
['democratic',
 'national',
 'convention',
 'land',
 'acknowledgement',
 'the',
 'democratic',
 'national',
 'committee',
 'wishes']
''' CLEANING THE TEXT -- REPUBLICAN '''

## Replace every non-word character (punctuation, newlines, other whitespace) with a space.
## The pattern is a raw string so that `\W` is not parsed as an (invalid) string escape.
republican_text = re.sub(r'\W', ' ', republican_pdf)

## Trim leading and trailing spaces
republican_string = republican_text.strip(" ")

## Split on single spaces; consecutive spaces leave empty strings in the list,
## which the filtering step that follows removes
republican_text_list = republican_string.split(" ")
republican_text_list[0:10]
['4343RDRD',
 'REPUBLICAN',
 'NATIONAL',
 'CONVENTION',
 'REPUBLICAN',
 'NATIONAL',
 'CONVENTION',
 'PLATFORMTHE',
 '2024',
 'REPUBLICAN']
## The split leaves a few empty strings behind, so keep only tokens that are longer than
## one character and contain no digits, lowercasing each kept token.
## (The length check also drops genuine one-letter words.)
rep_text_clean = [
    word.lower()
    for word in republican_text_list
    if len(word) > 1 and not re.search(r"\d", word)
]
rep_text_clean[0:10]
['republican',
 'national',
 'convention',
 'republican',
 'national',
 'convention',
 'platformthe',
 'republican',
 'make',
 'america']
''' CREATING A DATAFRAME '''
## Collapse each party's cleaned token list back into one space-separated document
rep_text_final = ' '.join(rep_text_clean)
dem_text_final = ' '.join(dem_text_clean)

## One row per party: the label and the full cleaned platform text
party_platforms = pd.DataFrame({
    'Party': ['Republican', 'Democrat'],
    'Text': [rep_text_final, dem_text_final],
})
party_platforms
Party Text
0 Republican republican national convention republican nati...
1 Democrat democratic national convention land acknowledg...

4.2.1 Stemming

## Stem each party's cleaned platform text with the notebook's `stemmer` helper
rep_stemmed = stemmer(rep_text_final)
dem_stemmed = stemmer(dem_text_final)

## Re-join each party's OWN stemmed tokens into a single document.
## (Joining `rep_stemmed` for both parties makes the two rows of every stemmed matrix identical.)
rep_stem_text = ' '.join(rep_stemmed)
dem_stem_text = ' '.join(dem_stemmed)

## Count-vectorize the two stemmed documents, labelled by party.
## `count_vectorizer_creation` is defined elsewhere in the notebook; 10000 is presumably its feature cap -- confirm.
platform_vec_stemmed = count_vectorizer_creation(
    10000,
    [rep_stem_text, dem_stem_text],
    ['Republican', 'Democrat'],
    'Party',
)
platform_vec_stemmed
Party abernathi abil abl abort absolut abund access accomplish accord ... worst wrongdoer www wyom year yob york young zack zoraida
0 Republican 1 1 1 1 1 3 5 1 2 ... 1 1 1 1 7 1 1 7 1 1
1 Democrat 1 1 1 1 1 3 5 1 2 ... 1 1 1 1 7 1 1 7 1 1

2 rows × 1379 columns

## Persist the stemmed count matrix
platform_vec_stemmed.to_csv("Party Platform Stemmed- Count Vectorizer.csv")

## Build the TF-IDF counterpart from the same two stemmed documents
## (`tfidf_vectorizer_creation` is a helper defined elsewhere in the notebook)
platform_tfidf_stemmed = tfidf_vectorizer_creation(
    10000,
    [rep_stem_text, dem_stem_text],
    ['Republican', 'Democrat'],
    'Party',
)
platform_tfidf_stemmed
Party abernathi abil abl abort absolut abund access accomplish accord ... worst wrongdoer www wyom year yob york young zack zoraida
0 Republican 0.004754 0.004754 0.004754 0.004754 0.004754 0.014261 0.023769 0.004754 0.009508 ... 0.004754 0.004754 0.004754 0.004754 0.033277 0.004754 0.004754 0.033277 0.004754 0.004754
1 Democrat 0.004754 0.004754 0.004754 0.004754 0.004754 0.014261 0.023769 0.004754 0.009508 ... 0.004754 0.004754 0.004754 0.004754 0.033277 0.004754 0.004754 0.033277 0.004754 0.004754

2 rows × 1379 columns

## Persist the stemmed TF-IDF matrix (with pandas defaults the row index is written as the first column)
platform_tfidf_stemmed.to_csv("Party Platform Stemmed- TFIDF.csv")

4.2.2 Lemmatization

## Lemmatize each party's cleaned platform text with the notebook's `lemmer` helper
rep_lemmed = lemmer(rep_text_final)
dem_lemmed = lemmer(dem_text_final)

## Re-join each party's lemmatized tokens into a single document
rep_lem_text = ' '.join(rep_lemmed)
dem_lem_text = ' '.join(dem_lemmed)

## Count-vectorize the two lemmatized documents, labelled by party
## (`count_vectorizer_creation` is a helper defined elsewhere in the notebook)
platform_vec_lemmed = count_vectorizer_creation(
    10000,
    [rep_lem_text, dem_lem_text],
    ['Republican', 'Democrat'],
    'Party',
)
platform_vec_lemmed
Party ability able abortion access accessible according accountability accountable achieved ... won word work worker working world worship worst year young
0 Republican 1 1 1 4 1 1 1 4 1 ... 1 1 4 15 2 15 2 1 7 7
1 Democrat 7 13 13 72 15 1 6 14 1 ... 10 1 101 79 81 69 4 4 149 10

2 rows × 893 columns

## Persist the lemmatized count matrix
platform_vec_lemmed.to_csv("Party Platform Lemmed- Count Vectorizer.csv")

## Build the TF-IDF counterpart from the same two lemmatized documents
## (`tfidf_vectorizer_creation` is a helper defined elsewhere in the notebook)
platform_tfidf_lemmed = tfidf_vectorizer_creation(
    10000,
    [rep_lem_text, dem_lem_text],
    ['Republican', 'Democrat'],
    'Party',
)
platform_tfidf_lemmed
Party ability able abortion access accessible according accountability accountable achieved ... won word work worker working world worship worst year young
0 Republican 0.005157 0.005157 0.005157 0.020627 0.005157 0.005157 0.005157 0.020627 0.005157 ... 0.005157 0.005157 0.020627 0.077351 0.010314 0.077351 0.010314 0.005157 0.036097 0.036097
1 Democrat 0.006634 0.012320 0.012320 0.068233 0.014215 0.000948 0.005686 0.013268 0.000948 ... 0.009477 0.000948 0.095716 0.074867 0.076762 0.065390 0.003791 0.003791 0.141205 0.009477

2 rows × 893 columns

## Persist the lemmatized TF-IDF matrix (with pandas defaults the row index is written as the first column)
platform_tfidf_lemmed.to_csv("Party Platform Lemmed- TFIDF.csv")

4.2.3 CountVectorizer

## Count-vectorize the cleaned platform texts as they are (no stemming or lemmatization applied first)
## (`count_vectorizer_creation` is a helper defined elsewhere in the notebook)
platform_vec = count_vectorizer_creation(
    10000,
    [rep_text_final, dem_text_final],
    ['Republican', 'Democrat'],
    'Party',
)
platform_vec
Party ability able abortion access accessible according accountability accountable achieved ... work worker workers working world worship worst year years young
0 Republican 1 1 1 4 1 1 1 4 1 ... 4 1 14 2 15 2 1 2 5 7
1 Democrat 7 13 13 72 15 1 6 14 1 ... 100 4 75 81 69 4 4 80 69 10

2 rows × 946 columns

## Persist the count matrix (with pandas defaults the row index is written as the first column)
platform_vec.to_csv("Party Platform - Count Vectorizer.csv")

4.2.4 TF-IDF Vectorizer

## TF-IDF vectorize the cleaned platform texts as they are (no stemming or lemmatization applied first)
## (`tfidf_vectorizer_creation` is a helper defined elsewhere in the notebook)
platform_tfidf = tfidf_vectorizer_creation(
    10000,
    [rep_text_final, dem_text_final],
    ['Republican', 'Democrat'],
    'Party',
)
platform_tfidf
Party ability able abortion access accessible according accountability accountable achieved ... work worker workers working world worship worst year years young
0 Republican 0.005705 0.005705 0.005705 0.022819 0.005705 0.005705 0.005705 0.022819 0.005705 ... 0.022819 0.005705 0.079866 0.011409 0.085571 0.011409 0.005705 0.011409 0.028524 0.039933
1 Democrat 0.007566 0.014051 0.014051 0.077823 0.016213 0.001081 0.006485 0.015132 0.001081 ... 0.108088 0.004324 0.081066 0.087551 0.074581 0.004324 0.004324 0.086470 0.074581 0.010809

2 rows × 946 columns

## Persist the TF-IDF matrix (with pandas defaults the row index is written as the first column)
platform_tfidf.to_csv("Party Platform - TF-IDF.csv")

4.3 Visualizing the Clean Party Platforms

# Select the Republican row of the lemmatized count matrix
republican_platform_clean = platform_vec_lemmed[platform_vec_lemmed['Party'] == 'Republican']

# After transposing, to_dict() is keyed by the original row label; the Republican row has label 0
freq = republican_platform_clean.transpose().to_dict()[0]
# 'Party' holds the label, not a term count, so it must not reach the word cloud
del freq['Party']

# Seaborn theme / font settings for the figure
sb.set_style("white")
sb.set(font='Times New Roman', font_scale=1.2)

# Word cloud sized by the lemmatized term counts
wc = WordCloud(
    width=600,
    height=300,
    background_color='white',
    colormap='OrRd',
    max_words=250,
)
wc.generate_from_frequencies(freq)

# Draw the cloud without axes, title it, and write the image to disk
plt.figure(figsize=(9, 6), dpi=750)
plt.imshow(wc)
plt.axis('off')
plt.title("GOP Party Platform - 2024\nClean PDF Text")
plt.savefig("GOP Party Platform - Clean Text", dpi=1000);

png

# Select the Democrat row of the lemmatized count matrix
democrat_platform_clean = platform_vec_lemmed[platform_vec_lemmed['Party'] == 'Democrat']

# After transposing, to_dict() is keyed by the original row label; the Democrat row has label 1
freq = democrat_platform_clean.transpose().to_dict()[1]
# 'Party' holds the label, not a term count, so it must not reach the word cloud
del freq['Party']

# Seaborn theme / font settings for the figure
sb.set_style("white")
sb.set(font='Times New Roman', font_scale=1.2)

# Word cloud sized by the lemmatized term counts
wc = WordCloud(
    width=600,
    height=300,
    background_color='white',
    colormap='Blues',
    max_words=250,
)
wc.generate_from_frequencies(freq)

# Draw the cloud without axes, title it, and write the image to disk
plt.figure(figsize=(9, 6), dpi=750)
plt.imshow(wc)
plt.axis('off')
plt.title("DNC Party Platform - 2024\nClean PDF Text")
plt.savefig("DNC Party Platform - Clean Text", dpi=1000);

png