Political Stances: Latent Dirichlet Allocation Code
5. Latent Dirichlet Allocation
1. Environment Creation:
1.1 Library Import
''' DATA MANAGEMENT '''
import pandas as pd
import regex as re
import numpy as np
import os
from gensim import corpora
from gensim.corpora import Dictionary
''' TEXT PROCESSING '''
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
''' DATA VISUALIZATION '''
import seaborn as sb
from wordcloud import WordCloud
import matplotlib.pyplot as plt
''' LDA VIS'''
import pyLDAvis
''' SANITY '''
from tqdm import tqdm
1.2 Data Import
news_data = pd.read_csv(r"C:\Users\natal\OneDrive\university\info 5653\data\News Articles Lemmed- Count Vectorizer.csv")
news_data
Unnamed: 0 | Party | publisher | aapi | abandon | abandoned | abc | ability | able | abolish | ... | yes | york | young | youth | zealot | zeldin | zero | zers | zone | zuckerberg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Republican | The Verge | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1 | Republican | Gizmodo.com | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 2 | Republican | BBC News | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 3 | Republican | BBC News | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 4 | Republican | BBC News | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
815 | 816 | Democrat | PBS | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
816 | 817 | Democrat | PBS | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
817 | 818 | Democrat | The Times of India | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
818 | 819 | Democrat | The Times of India | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
819 | 699 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
820 rows × 2361 columns
labels_news_party = news_data['Party'].to_list()
labels_news_publisher = news_data['publisher'].to_list()
news_data.drop(columns=['Unnamed: 0','Party','publisher'],inplace=True)
news_data.fillna(0,inplace=True)
news_data.head()
aapi | abandon | abandoned | abc | ability | able | abolish | abortion | absolutely | abuse | ... | yes | york | young | youth | zealot | zeldin | zero | zers | zone | zuckerberg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 2358 columns
Climate Bill Data
bills_data = pd.read_csv(r"C:\Users\natal\OneDrive\university\info 5653\data\Bills Lemmed- Count Vectorizer.csv")
labels_bills_billtype = bills_data['Bill Type']
labels_bills_sponser_affiliation = bills_data['Sponser Affiliation']
labels_bills_sponser_state = bills_data['Sponser State']
labels_bills_committees = bills_data['Committees']
bills_data.drop(columns=['Unnamed: 0','Bill Type','Sponser Affiliation','Sponser State','Committees'],inplace=True)
bills_data.fillna(0,inplace=True)
bills_data.head()
aa | aaa | aarhu | ab | abandon | abandonth | abat | abbrevi | abercrombi | abey | ... | zoe | zone | zonea | zonesnotwithstand | zoneth | zoo | zoolog | zoonot | zooplankton | zquez | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 15489 columns
Party Platform Data
party_data = pd.read_csv(r"C:\Users\natal\OneDrive\university\info 5653\data\Party Platform Lemmed- Count Vectorizer.csv")
labels_party_party = party_data['Party']
party_data.drop(columns=["Unnamed: 0","Party"],inplace=True)
party_data.fillna(0,inplace=True)
party_data.head()
ability | able | abortion | access | accessible | according | accountability | accountable | achieved | act | ... | won | word | work | worker | working | world | worship | worst | year | young | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 4 | 1 | 1 | 1 | 4 | 1 | 4 | ... | 1 | 1 | 4 | 15 | 2 | 15 | 2 | 1 | 7 | 7 |
1 | 7 | 13 | 13 | 72 | 15 | 1 | 6 | 14 | 1 | 88 | ... | 10 | 1 | 101 | 79 | 81 | 69 | 4 | 4 | 149 | 10 |
2 rows × 892 columns
1.3 Re-Creating the Count Vectorizers
## Recreate CountVectorizer with the original vocabulary
vectorizer_news = CountVectorizer(vocabulary=news_data.columns)
vectorizer_bills = CountVectorizer(vocabulary=bills_data.columns)
vectorizer_party = CountVectorizer(vocabulary=party_data.columns)
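Since the CSVs only store the finished document-term matrices, passing vocabulary= pins each vectorizer to the exact columns (and column order) of its DataFrame. As a quick sanity check (a sketch added here, using a made-up sample string), transform works without refitting and stays aligned with those columns:
## Sanity check (added sketch, not original output): a fixed-vocabulary
## vectorizer needs no fit and maps text onto the saved column order.
sample_counts = vectorizer_news.transform(["the abandoned abc energy bill"])
print(sample_counts.shape)   # (1, 2358) - one row, same width as news_data
print(sample_counts.sum())   # counts only the tokens present in the saved vocabulary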
2. LDA Topic Modeling
''' TESTING TOPIC MODELS '''
## First, creating a storage container to test numbers of topics:
topic_ns = list(range(2, 20, 2))
## Creating a storage container for the topics in the list:
def save_topics(model, vectorizer, top_n=10):
topic_contents = {}
## Iterating through the topics in the model components
for idx, topic in enumerate(model.components_):
## Extracting the top words:
top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-top_n - 1:-1]]
## Storing them:
topic_contents[idx] = top_words
    return topic_contents
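The slicing idiom in save_topics is terse, so here is a standalone demo (added for reference) of how argsort()[:-top_n - 1:-1] pulls the top-weighted indices in descending order:
## Added demo: argsort sorts ascending, and the reversed slice walks the
## last top_n entries backwards, i.e. the largest weights first.
demo_weights = np.array([0.1, 0.9, 0.3, 0.7])
print(demo_weights.argsort()[:-2 - 1:-1])   # [1 3] -> positions of 0.9 and 0.7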
## Creating something to visualize the topics with:
def topic_visualizer(lda_model, dataset, topic_number, plot_title, fontsize=10):
word_topic = np.array(lda_model.components_).transpose()
vocab_array = np.asarray(dataset.columns.to_list())
fontsize_base = fontsize
plt.rcParams["font.family"] = "Times New Roman"
fig, axes = plt.subplots(1, topic_number, figsize=(3 * topic_number, 6),dpi=1000) # Adjust figure size
plt.suptitle(plot_title, fontsize=16, fontname="Times New Roman", fontweight="bold") # Main title
## Iterating and plotting the topics
for t in range(topic_number):
ax = plt.subplot(1, topic_number, t + 1) # Create subplot
ax.set_ylim(0, 15 + 0.5) # Stretch the y-axis to accommodate the words
ax.set_xticks([]) # Remove x-axis markings ('ticks')
ax.set_yticks([]) # Remove y-axis markings ('ticks')
ax.set_title(f'Topic #{t}', fontname="Times New Roman") # Set title font
# Change border (spine) colors to blue
for spine in ax.spines.values():
spine.set_edgecolor("blue") # Set border color
            spine.set_linewidth(0.5) # Set the border width
top_words_idx = np.argsort(word_topic[:, t])[::-1] # Descending order
top_words_idx = top_words_idx[:15]
top_words = vocab_array[top_words_idx]
top_words_shares = word_topic[top_words_idx, t]
for i, (word, share) in enumerate(zip(top_words, top_words_shares)):
ax.text(0.3, 15 - i - 0.5, word, fontsize=fontsize_base, fontname="Times New Roman")
plt.tight_layout()
plt.show()
## And now creating a reusable tester
def lda_tester(topic_number,dataset,vectorizer,top_n,dataset_name):
## Instantiating a model:
lda_model = LatentDirichletAllocation(n_components=topic_number,max_iter=50, learning_method='online')
    ## Fitting the model (fit_transform also returns the document-topic matrix, unused here):
    doc_topics = lda_model.fit_transform(dataset)
## Storing the contents of the topic model:
topic_contents = save_topics(lda_model, vectorizer,top_n)
plot_title = f"LDA for {topic_number} Clusters - {dataset_name}"
#topic_visualizer(lda_model, dataset,topic_number,plot_title,fontsize=10)
    return lda_model, topic_contents
## And finally wrapping it into a loop
def lda_modeler(start_topics, end_topics,dataset,vectorizer,top_n,dataset_name):
dataset_topics = {}
    for topic_set in tqdm(range(start_topics, end_topics, 2), desc='🐜🐛... inching through data', leave=False):
lda_model, lda_topics = lda_tester(topic_set,dataset,vectorizer,top_n,dataset_name)
dataset_topics[topic_set] = {"LDA MODEL":lda_model,"LDA TOPICS": lda_topics}
    return dataset_topics
''' TESTING NEWS DATA'''
news_model = lda_modeler(6,20,news_data, vectorizer_news,15,'Climate News')
''' TESTING CLIMATE DATA '''
bills_model = lda_modeler(6,20,bills_data, vectorizer_bills,15,'Proposed Climate Bills')
''' TESTING PARTY DATA '''
party_model = lda_modeler(6,20,party_data, vectorizer_party,15,'2024 Party Platforms')
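Before eyeballing the topics, a rough quantitative cross-check (added here, not part of the original pipeline): scikit-learn's LDA exposes perplexity(), which can be compared across the stored models. Lower is generally better, though perplexity is known to track human-judged topic quality only loosely, so treat it as a guide rather than a verdict.
## Added sketch: compare perplexity across the topic counts already fitted.
for n_topics, stored in news_model.items():
    print(f"{n_topics:>2} topics -> perplexity: {stored['LDA MODEL'].perplexity(news_data):,.1f}")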
examining the outputs
news_model
{6: {'LDA MODEL': LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=6),
'LDA TOPICS': {0: ['trump',
'biden',
'new',
'energy',
'president',
'republican',
'wa',
'gas',
'house',
'people',
'oil',
'vote',
'fuel',
'drilling',
'time'],
1: ['wildfire',
'california',
'los',
'angeles',
'newsom',
'la',
'city',
'gavin',
'bass',
'mayor',
'state',
'democrat',
'home',
'post',
'ha'], ... (truncated) ...}
bills_model
{6: {'LDA MODEL': LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=6),
'LDA TOPICS': {0: ['substanc',
'chemic',
'wast',
'cover',
'facil',
'recycl',
'manufactur',
'materi',
'site',
'plastic',
'dispos',
'notic',
'mixtur',
'violat',
'discharg'], ... (truncated) ...}
party_model
{6: {'LDA MODEL': LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=6),
'LDA TOPICS': {0: ['republican',
'american',
'policy',
'border',
'restore',
'america',
'great',
'country',
'protect',
'education',
'support',
'right',
'common',
'people',
'government'], ... (truncated) ...}
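The raw dictionaries are hard to scan, so here is a small convenience printer (added here, built around the {'LDA MODEL': ..., 'LDA TOPICS': ...} structure that lda_modeler returns):
## Added helper: one line per topic instead of the nested dict dumps above.
def print_topics(model_results, n_topics):
    for topic_id, words in model_results[n_topics]['LDA TOPICS'].items():
        print(f"Topic {topic_id:>2}: {', '.join(words)}")
print_topics(news_model, 6)   # e.g. the 6-topic news model shown above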
visualizing the best topic models
bills_model_best = bills_model[18]['LDA MODEL']
topic_visualizer(bills_model_best,bills_data,18,"LDA Topic Modeling with 18 Clusters - Introduced Climate Bills",fontsize=10)
[Figure: LDA Topic Modeling with 18 Clusters - Introduced Climate Bills]
import numpy as np
import matplotlib.pyplot as plt
red_words = {"republican", "trump",'donald','musk','elon'}
blue_words = {"democrat", "biden",'joe','newsom'}
def topic_visualizer(lda_model, vectorizer, topic_number, plot_title, fontsize=10):
word_topic = np.array(lda_model.components_).transpose()
vocab_array = np.array(vectorizer.get_feature_names_out()) # Get vocab directly
fontsize_base = fontsize
plt.rcParams["font.family"] = "Times New Roman"
fig, axes = plt.subplots(1, topic_number, figsize=(3 * topic_number, 6), dpi=1000)
plt.suptitle(plot_title, fontsize=16, fontname="Times New Roman", fontweight="bold")
for t in range(topic_number):
ax = plt.subplot(1, topic_number, t + 1)
ax.set_ylim(0, 15 + 0.5)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title(f'Topic #{t}', fontname="Times New Roman")
for spine in ax.spines.values():
spine.set_edgecolor("blue")
spine.set_linewidth(0.5)
# Fix: Ensure indices are within vocab bounds
valid_indices = np.argsort(word_topic[:, t])[::-1] # Sort by importance
valid_indices = [idx for idx in valid_indices if idx < len(vocab_array)][:15] # Keep valid indices
top_words = vocab_array[valid_indices]
for i, word in enumerate(top_words):
color = "black"
if word.lower() in red_words:
color = "red"
elif word.lower() in blue_words:
color = "blue"
ax.text(0.3, 15 - i - 0.5, word, fontsize=fontsize_base, fontname="Times New Roman", color=color)
plt.tight_layout()
plt.show()
party_model_best = party_model[8]['LDA MODEL']
topic_visualizer(party_model_best,vectorizer_party,8,"LDA Topic Modeling with 8 Clusters - 2024 Partisan Platforms",fontsize=10)
[Figure: LDA Topic Modeling with 8 Clusters - 2024 Partisan Platforms]
news_model_best = news_model[8]['LDA MODEL']
topic_visualizer(news_model_best, vectorizer_news, 8,"LDA Topic Modeling with 8 Clusters - Climate Related News", fontsize=10)
[Figure: LDA Topic Modeling with 8 Clusters - Climate Related News]
creating a special visualizer for the longer topic models
import numpy as np
import matplotlib.pyplot as plt
import base64
from io import BytesIO
def topic_visualizer_scrollable(lda_model, dataset, topic_number, plot_title, fontsize=10, output_file="topic_visualizer.html"):
word_topic = np.array(lda_model.components_).transpose()
vocab_array = np.asarray(dataset.columns.to_list())
fontsize_base = fontsize
plt.rcParams["font.family"] = "Times New Roman"
fig, axes = plt.subplots(1, topic_number, figsize=(2 * topic_number, 6), dpi=1000)
for t in range(topic_number):
ax = plt.subplot(1, topic_number, t + 1)
ax.set_ylim(0, 15 + 0.5)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title(f'Topic #{t}', fontname="Times New Roman")
for spine in ax.spines.values():
spine.set_edgecolor("blue")
spine.set_linewidth(0.5)
top_words_idx = np.argsort(word_topic[:, t])[::-1][:min(15, len(vocab_array))]
top_words = vocab_array[top_words_idx]
for i, word in enumerate(top_words):
ax.text(0.3, 15 - i - 0.5, word, fontsize=fontsize_base, fontname="Times New Roman")
plt.tight_layout()
# Save figure to a BytesIO object
buf = BytesIO()
plt.savefig(buf, format="png", bbox_inches="tight", dpi=300)
plt.close(fig)
# Convert image to Base64
encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
img_html = f'<img src="data:image/png;base64,{encoded}" style="max-height:600px;">'
# HTML template with scrollable container
html_template = f"""
<html>
<head>
    <style>
        .scroll-container {{
            width: 100%;
            overflow-x: auto;
        }}
    </style>
</head>
<body>
<h2 style="font-family: 'Times New Roman'; text-align: center;">{plot_title}</h2>
<div class="scroll-container">
{img_html}
</div>
</body>
</html>
"""
# Save HTML file
with open(output_file, "w", encoding="utf-8") as f:
f.write(html_template)
print(f"HTML file saved: {output_file}")
topic_visualizer_scrollable(bills_model_best,bills_data,18,'LDA Topic Modeling with 18 Clusters - Introduced Climate Policy',10,'lda_climate_bills.html')
HTML file saved: lda_climate_bills.html
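To view the file without digging through folders, one option (added here as a convenience) is to open it straight from Python:
## Added convenience: open the saved HTML in the default browser.
import os, webbrowser
webbrowser.open('file://' + os.path.abspath('lda_climate_bills.html'))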
3. Creating Intertopic Distance Maps
def lda_visualizer(data,model,vectorizer,filename):
    ''' PREPARING THE DATA TO FIT '''
doc_lengths_sparse = data.sum(axis=1)
## converting to an array
doc_lengths = np.asarray(doc_lengths_sparse).flatten()
## normalizing the distributions from the model
topic_term_distributions = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    ## extracting the document-topic distributions
document_topic_distributions = model.transform(data)
## extracting the vocabulary from the vectorizer
vocabulary = vectorizer.get_feature_names_out()
## extracting the term frequencies
term_frequencies = np.asarray(data.sum(axis=0)).flatten()
''' GENERATING THE VISUALIZATION'''
visualization = pyLDAvis.prepare(topic_term_dists=topic_term_distributions,
doc_topic_dists=document_topic_distributions,
doc_lengths=doc_lengths,
vocab=vocabulary,
term_frequency=term_frequencies)
''' SAVING THE VISUALIZATION'''
saving_filename = filename+".html"
pyLDAvis.save_html(visualization, saving_filename)
lda_visualizer(bills_data,bills_model_best,vectorizer_bills,'LDA Interactive Topics - Bills')
lda_visualizer(party_data,party_model_best,vectorizer_party,'LDA Interactive Topics - Party')
lda_visualizer(news_data,news_model_best,vectorizer_news,'LDA Interactive Topics - News')
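The saved files open in any browser. If working interactively instead, pyLDAvis can also render in-notebook (an aside added here; it assumes lda_visualizer is tweaked to end with return visualization rather than only saving the HTML):
## Added aside, assuming lda_visualizer gains a `return visualization` at the end:
# pyLDAvis.enable_notebook()
# vis = lda_visualizer(news_data, news_model_best, vectorizer_news, 'LDA Interactive Topics - News')
# vis   # the intertopic distance map renders inline in Jupyter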