5. Latent Dirichlet Allocation

1. Environment Creation

1.1 Library Import

''' DATA MANAGEMENT '''
import pandas as pd
import regex as re
import numpy as np
import os
from gensim import corpora
from gensim.corpora import Dictionary

''' TEXT PROCESSING '''
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


''' DATA VISUALIZATION '''
import seaborn as sb
from wordcloud import WordCloud
import matplotlib.pyplot as plt

''' LDA VIS '''
import pyLDAvis

''' SANITY '''
from tqdm import tqdm

1.2 Data Import

news_data = pd.read_csv(r"C:\Users\natal\OneDrive\university\info 5653\data\News Articles Lemmed- Count Vectorizer.csv")
news_data
Unnamed: 0 Party publisher aapi abandon abandoned abc ability able abolish ... yes york young youth zealot zeldin zero zers zone zuckerberg
0 0 Republican The Verge 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1 Republican Gizmodo.com 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 3 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 4 Republican BBC News 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
815 816 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
816 817 Democrat PBS 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
817 818 Democrat The Times of India 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
818 819 Democrat The Times of India NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
819 699 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

820 rows × 2361 columns

labels_news_party = news_data['Party'].to_list()
labels_news_publisher = news_data['publisher'].to_list()
news_data.drop(columns=['Unnamed: 0','Party','publisher'],inplace=True)
news_data.fillna(0,inplace=True)
news_data.head()
aapi abandon abandoned abc ability able abolish abortion absolutely abuse ... yes york young youth zealot zeldin zero zers zone zuckerberg
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 2358 columns
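
One row in the preview above had a missing Party label, so before modeling it is worth confirming that the label lists still line up with the cleaned matrix. A small sanity check (illustrative only):

## Label lists should match the matrix row count (820), and
## value_counts(dropna=False) surfaces the one missing Party label.
print(news_data.shape, len(labels_news_party), len(labels_news_publisher))
print(pd.Series(labels_news_party).value_counts(dropna=False))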

Climate Bill Data

bills_data = pd.read_csv(r"C:\Users\natal\OneDrive\university\info 5653\data\Bills Lemmed- Count Vectorizer.csv")
labels_bills_billtype = bills_data['Bill Type']
labels_bills_sponser_affiliation = bills_data['Sponser Affiliation']
labels_bills_sponser_state = bills_data['Sponser State']
labels_bills_committees = bills_data['Committees']
bills_data.drop(columns=['Unnamed: 0','Bill Type','Sponser Affiliation','Sponser State','Committees'],inplace=True)
bills_data.fillna(0,inplace=True)
bills_data.head()
aa aaa aarhu ab abandon abandonth abat abbrevi abercrombi abey ... zoe zone zonea zonesnotwithstand zoneth zoo zoolog zoonot zooplankton zquez
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 15489 columns
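
The bills matrix is far wider than the news matrix (15,489 stemmed terms vs. 2,358), so most cells are zero. A quick density check, purely illustrative, gives a feel for how sparse the LDA input is:

## Fraction of nonzero cells in the bills document-term matrix
density = (bills_data != 0).to_numpy().mean()
print(f"nonzero cells: {density:.4%}")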

Party Platform Data

party_data = pd.read_csv(r"C:\Users\natal\OneDrive\university\info 5653\data\Party Platform Lemmed- Count Vectorizer.csv")
labels_party_party = party_data['Party']
party_data.drop(columns=["Unnamed: 0","Party"],inplace=True)
party_data.fillna(0,inplace=True)
party_data.head()
ability able abortion access accessible according accountability accountable achieved act ... won word work worker working world worship worst year young
0 1 1 1 4 1 1 1 4 1 4 ... 1 1 4 15 2 15 2 1 7 7
1 7 13 13 72 15 1 6 14 1 88 ... 10 1 101 79 81 69 4 4 149 10

2 rows × 892 columns

1.3 Re-Creating the Count Vectorizers


## Recreate CountVectorizer with the original vocabulary
vectorizer_news = CountVectorizer(vocabulary=news_data.columns)
vectorizer_bills = CountVectorizer(vocabulary=bills_data.columns)
vectorizer_party = CountVectorizer(vocabulary=party_data.columns)
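
Because each vectorizer is rebuilt from the saved column order, any new text gets mapped into exactly the same feature space as the stored matrices. A minimal check (the sample sentence is invented):

## Transform an arbitrary string and confirm the feature width matches news_data
sample_counts = vectorizer_news.transform(["the republican energy plan and the democrat climate bill"])
print(sample_counts.shape)   # (1, 2358) — same columns as news_data
print(sample_counts.sum())   # how many tokens landed in the saved vocabulary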

2. LDA Topic Modeling

''' TESTING TOPIC MODELS '''

## First, creating the list of topic counts to test (2 through 18, stepping by 2):

topic_ns = list(range(2, 20, 2))
    
## Creating a storage container for the topics in the list:
def save_topics(model, vectorizer, top_n=10):
    topic_contents = {}
    feature_names = vectorizer.get_feature_names_out()  # hoisted so the vocab is built once, not once per topic
    
    ## Iterating through the topics in the model components
    for idx, topic in enumerate(model.components_):
        
        ## Extracting the top_n words by weight, in descending order:
        top_words = [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]]
        
        ## Storing them: 
        topic_contents[idx] = top_words
        
    return topic_contents
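
The argsort slice above is easy to misread, so here is a toy trace of what it returns (numbers invented for illustration):

## topic.argsort() sorts ascending; [:-top_n - 1:-1] walks backward through
## the last top_n entries, i.e. the indices of the top_n largest weights.
toy_topic = np.array([0.1, 0.7, 0.3, 0.9])
print(toy_topic.argsort())              # [0 2 1 3]  (ascending)
print(toy_topic.argsort()[:-2 - 1:-1])  # [3 1]      (top 2, descending)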

## Creating something to visualize the topics with:
def topic_visualizer(lda_model, dataset, topic_number, plot_title, fontsize=10):
    
    ## Rows = words, columns = topics
    word_topic = np.array(lda_model.components_).transpose()
    
    vocab_array = np.asarray(dataset.columns.to_list())
    
    fontsize_base = fontsize
    plt.rcParams["font.family"] = "Times New Roman"  
    
    fig, axes = plt.subplots(1, topic_number, figsize=(3 * topic_number, 6), dpi=1000)  # Adjust figure size
    plt.suptitle(plot_title, fontsize=16, fontname="Times New Roman", fontweight="bold")  # Main title
    
    ## Iterating and plotting the topics
    for t in range(topic_number):
        ax = axes[t]  # Reuse the axes created above instead of calling plt.subplot again
        ax.set_ylim(0, 15 + 0.5)  # Stretch the y-axis to accommodate 15 words
        ax.set_xticks([])  # Remove x-axis markings ('ticks')
        ax.set_yticks([])  # Remove y-axis markings ('ticks')
        ax.set_title(f'Topic #{t}', fontname="Times New Roman")  # Set title font
        
        # Change border (spine) colors to blue and keep them thin
        for spine in ax.spines.values():
            spine.set_edgecolor("blue")  # Set border color
            spine.set_linewidth(0.5)  # Set border width

        top_words_idx = np.argsort(word_topic[:, t])[::-1]  # Descending order by weight
        top_words_idx = top_words_idx[:15]
        top_words = vocab_array[top_words_idx]

        for i, word in enumerate(top_words):
            ax.text(0.3, 15 - i - 0.5, word, fontsize=fontsize_base, fontname="Times New Roman")

    plt.tight_layout()
    plt.show()

## And now creating a reusable single-model tester
def lda_tester(topic_number, dataset, vectorizer, top_n, dataset_name):
    ## Instantiating a model:
    
    lda_model = LatentDirichletAllocation(n_components=topic_number, max_iter=50, learning_method='online')
    
    ## Fitting the model (the returned document-topic matrix is not needed here):
    lda_model.fit(dataset)
    
    ## Storing the contents of the topic model:
    topic_contents = save_topics(lda_model, vectorizer, top_n)

    plot_title = f"LDA for {topic_number} Clusters - {dataset_name}"
    #topic_visualizer(lda_model, dataset,topic_number,plot_title,fontsize=10)
    
    return lda_model, topic_contents
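
One caveat: the online LDA fit is stochastic, so the topics (and which topic count looks best) can shift between runs. If repeatability matters, sklearn's LatentDirichletAllocation accepts a random_state seed; a sketch with an arbitrary seed value:

## Same call as in lda_tester, but pinned to a seed for reproducible topics
lda_model = LatentDirichletAllocation(n_components=10, max_iter=50,
                                      learning_method='online', random_state=42)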

## And finally wrapping it into a loop over the topic counts
def lda_modeler(start_topics, end_topics, dataset, vectorizer, top_n, dataset_name):
    
    dataset_topics = {}
    
    for topic_set in tqdm(range(start_topics, end_topics, 2), desc='🐜🐛... inching through data', leave=False):
        lda_model, lda_topics = lda_tester(topic_set, dataset, vectorizer, top_n, dataset_name)
        
        dataset_topics[topic_set] = {"LDA MODEL": lda_model, "LDA TOPICS": lda_topics}
        
    return dataset_topics
''' TESTING NEWS DATA'''
news_model  = lda_modeler(6,20,news_data, vectorizer_news,15,'Climate News')
''' TESTING BILLS DATA '''
bills_model  = lda_modeler(6,20,bills_data, vectorizer_bills,15,'Proposed Climate Bills')
''' TESTING PARTY DATA '''
party_model  = lda_modeler(6,20,party_data, vectorizer_party,15,'2024 Party Platforms')
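
The best models below are chosen by inspecting the topic words; for a rough numeric guide, a fitted sklearn LDA also exposes score() (approximate log likelihood) and perplexity(), where lower perplexity is loosely better:

## Rough numeric comparison across the tested topic counts
for k, entry in news_model.items():
    print(k, round(entry['LDA MODEL'].perplexity(news_data), 1))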

Examining the Outputs

news_model
{6: {'LDA MODEL': LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=6),
  'LDA TOPICS': {0: ['trump',
    'biden',
    'new',
    'energy',
    'president',
    'republican',
    'wa',
    'gas',
    'house',
    'people',
    'oil',
    'vote',
    'fuel',
    'drilling',
    'time'],
   1: ['wildfire',
    'california',
    'los',
    'angeles',
    'newsom',
    'la',
    'city',
    'gavin',
    'bass',
    'mayor',
    'state',
    'democrat',
    'home',
    'post',
    'ha'], ... (truncated) ...}
bills_model
{6: {'LDA MODEL': LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=6),
  'LDA TOPICS': {0: ['substanc',
    'chemic',
    'wast',
    'cover',
    'facil',
    'recycl',
    'manufactur',
    'materi',
    'site',
    'plastic',
    'dispos',
    'notic',
    'mixtur',
    'violat',
    'discharg'], ... (truncated) ...}
party_model
{6: {'LDA MODEL': LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=6),
  'LDA TOPICS': {0: ['republican',
    'american',
    'policy',
    'border',
    'restore',
    'america',
    'great',
    'country',
    'protect',
    'education',
    'support',
    'right',
    'common',
    'people',
    'government'], ... (truncated) ...}

Visualizing the Best Topic Models

bills_model_best = bills_model[18]['LDA MODEL']
topic_visualizer(bills_model_best,bills_data,18,"LDA Topic Modeling with 18 Clusters - Introduced Climate Bills",fontsize=10)

red_words = {"republican", "trump",'donald','musk','elon'}
blue_words = {"democrat", "biden",'joe','newsom'}

def topic_visualizer(lda_model, vectorizer, topic_number, plot_title, fontsize=10):
    word_topic = np.array(lda_model.components_).transpose()
    vocab_array = np.array(vectorizer.get_feature_names_out())  # Get vocab directly

    fontsize_base = fontsize
    plt.rcParams["font.family"] = "Times New Roman"

    fig, axes = plt.subplots(1, topic_number, figsize=(3 * topic_number, 6), dpi=1000)
    plt.suptitle(plot_title, fontsize=16, fontname="Times New Roman", fontweight="bold")

    for t in range(topic_number):
        ax = axes[t]  # Reuse the axes created above
        ax.set_ylim(0, 15 + 0.5)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(f'Topic #{t}', fontname="Times New Roman")

        for spine in ax.spines.values():
            spine.set_edgecolor("blue")
            spine.set_linewidth(0.5)

        # Guard against any index falling outside the vocabulary, then keep the top 15
        valid_indices = np.argsort(word_topic[:, t])[::-1]  # Sort by importance, descending
        valid_indices = [idx for idx in valid_indices if idx < len(vocab_array)][:15]
        top_words = vocab_array[valid_indices]

        for i, word in enumerate(top_words):
            color = "black"
            if word.lower() in red_words:
                color = "red"
            elif word.lower() in blue_words:
                color = "blue"

            ax.text(0.3, 15 - i - 0.5, word, fontsize=fontsize_base, fontname="Times New Roman", color=color)

    plt.tight_layout()
    plt.show()

party_model_best = party_model[8]['LDA MODEL']
topic_visualizer(party_model_best,vectorizer_party,8,"LDA Topic Modeling with 8 Clusters - 2024 Partisan Platforms",fontsize=10)
news_model_best = news_model[8]['LDA MODEL']
topic_visualizer(news_model_best, vectorizer_news, 8,"LDA Topic Modeling with 8 Clusters - Climate Related News", fontsize=10)

Creating a Special Visualizer for the Longer Topic Models

import base64
from io import BytesIO

def topic_visualizer_scrollable(lda_model, dataset, topic_number, plot_title, fontsize=10, output_file="topic_visualizer.html"):
    
    word_topic = np.array(lda_model.components_).transpose()
    vocab_array = np.asarray(dataset.columns.to_list())

    fontsize_base = fontsize
    plt.rcParams["font.family"] = "Times New Roman"  

    fig, axes = plt.subplots(1, topic_number, figsize=(2 * topic_number, 6), dpi=1000)
    
    for t in range(topic_number):
        ax = axes[t]  # Reuse the axes created above
        ax.set_ylim(0, 15 + 0.5)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(f'Topic #{t}', fontname="Times New Roman")

        for spine in ax.spines.values():
            spine.set_edgecolor("blue")
            spine.set_linewidth(0.5)

        top_words_idx = np.argsort(word_topic[:, t])[::-1][:min(15, len(vocab_array))]
        top_words = vocab_array[top_words_idx]

        for i, word in enumerate(top_words):
            ax.text(0.3, 15 - i - 0.5, word, fontsize=fontsize_base, fontname="Times New Roman")

    plt.tight_layout()

    # Save figure to a BytesIO object
    buf = BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight", dpi=300)
    plt.close(fig)

    # Convert image to Base64
    encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
    img_html = f'<img src="data:image/png;base64,{encoded}" style="max-height:600px;">'

    # HTML template with scrollable container
    html_template = f"""
    <html>
    <head>
        <style>
            /* horizontally scrollable container for the wide topic figure
               (braces doubled because this sits inside an f-string) */
            .scroll-container {{
                width: 100%;
                overflow-x: auto;
                white-space: nowrap;
            }}
        </style>
    </head>
    <body>
        <h2 style="font-family: 'Times New Roman'; text-align: center;">{plot_title}</h2>
        <div class="scroll-container">
            {img_html}
        </div>
    </body>
    </html>
    """

    # Save HTML file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_template)

    print(f"HTML file saved: {output_file}")


topic_visualizer_scrollable(bills_model_best,bills_data,18,'LDA Topic Modeling with 18 Clusters - Introduced Climate Policy',10,'lda_climate_bills.html')
HTML file saved: lda_climate_bills.html

3. Creating Intertopic Distance Maps

def lda_visualizer(data,model,vectorizer,filename):
    
    ''' PREPARING THE DATA TO FIT '''
    doc_lengths_sparse = data.sum(axis=1)
    
    ## converting to an array
    doc_lengths = np.asarray(doc_lengths_sparse).flatten()
    
    ## normalizing the distributions from the model
    topic_term_distributions = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    
    ## extracting the document-topic distributions
    document_topic_distributions = model.transform(data)
    
    ## extracting the vocabulary from the vectorizer
    vocabulary = vectorizer.get_feature_names_out()
    
    ## extracting the term frequencies
    term_frequencies = np.asarray(data.sum(axis=0)).flatten()
    
    ''' GENERATING THE VISUALIZATION '''
    visualization = pyLDAvis.prepare(topic_term_dists=topic_term_distributions,
                                    doc_topic_dists=document_topic_distributions,
                                    doc_lengths=doc_lengths,
                                    vocab=vocabulary,
                                    term_frequency=term_frequencies)
    
    ''' SAVING THE VISUALIZATION '''
    
    saving_filename = filename+".html"
    pyLDAvis.save_html(visualization, saving_filename)

    
lda_visualizer(bills_data,bills_model_best,vectorizer_bills,'LDA Interactive Topics - Bills')
lda_visualizer(party_data,party_model_best,vectorizer_party,'LDA Interactive Topics - Party')
lda_visualizer(news_data,news_model_best,vectorizer_news,'LDA Interactive Topics - News')
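
pyLDAvis can also render these maps inline in a notebook instead of writing HTML files; a minimal sketch reusing the same prepared inputs (enable_notebook() and display() are standard pyLDAvis helpers):

## Inline rendering of the news intertopic distance map
pyLDAvis.enable_notebook()
news_vis = pyLDAvis.prepare(
    topic_term_dists=news_model_best.components_ / news_model_best.components_.sum(axis=1)[:, np.newaxis],
    doc_topic_dists=news_model_best.transform(news_data),
    doc_lengths=np.asarray(news_data.sum(axis=1)).flatten(),
    vocab=vectorizer_news.get_feature_names_out(),
    term_frequency=np.asarray(news_data.sum(axis=0)).flatten(),
)
pyLDAvis.display(news_vis)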