Scripts
Here you can find the scripts we used in our project. Additional explanations of the code are given in the comments (the lines starting with #).
Code for sentiment analysis
import MeCab as mc
import pandas as pd
import numpy as np
import os
from collections import Counter
# Load the polarity dictionary: one sentiment score per lemma.
path_dic = os.path.sep.join(['dic', 'pn_ja_columnNames.dic'])
df_pn = pd.read_csv(path_dic, encoding="utf8", sep=",", usecols=['lemma', 'score'])
# Build the lemma -> score lookup once, so every word costs a single dict
# access instead of a scan over the whole dictionary column. When a lemma is
# listed more than once, the first row wins.
score_by_lemma = {}
for lemma, score in zip(df_pn['lemma'].values, df_pn['score'].values):
    score_by_lemma.setdefault(lemma, score)

# Setting up MeCab.
# NOTE(review): MeCab usually selects a dictionary with "-d <directory>";
# confirm that this bare argument really loads the intended dictionary.
tokenizer = mc.Tagger("unidic-kindai-bungo-v202512")
# NOTE(review): this second tagger is not used anywhere in this script.
mecab = mc.Tagger()

# The dataset to which the score columns are added at the end.
nd = pd.read_csv("wpc-only-needed-timeframe.csv")
# Only the filename column of the same dataset; iterated to locate the full texts.
filenames = pd.read_csv("wpc-only-needed-timeframe.csv", encoding="utf8", usecols=["Tokenized Filename"])

# One entry per text; these lists become new columns of nd.
scores_nouns = []
scores_verbs = []
scores_adjectives = []

# Folder that holds the tokenized text files.
link = "C:/Users/Ariana/wpc/database adding/tokenized/"


def _top_scores(words, limit=30):
    """Return the `limit` most frequent dictionary scores among `words`.

    Words without an entry in the polarity dictionary are ignored. The result
    is a list of (score, count) pairs as produced by Counter.most_common.
    """
    found = [score_by_lemma[word] for word in words if word in score_by_lemma]
    return Counter(found).most_common(limit)


# Process every text listed in the dataset, in row order.
for i, filename in enumerate(filenames["Tokenized Filename"]):
    # Use the cell value itself as the filename; rendering the row to text
    # first would drag column-alignment padding into the path.
    textFile = link + filename
    # Read the text and drop the spaces between the pre-tokenized words.
    # The with-block makes sure the file handle is closed again.
    with open(textFile, encoding="utf8") as text_file:
        sent = text_file.read().replace(" ", "")

    # Parse the text with MeCab and walk the resulting linked list of nodes.
    node = tokenizer.parseToNode(sent)
    nouns_toadd = []
    verbs_toadd = []
    adjectives_toadd = []
    while node:
        features = node.feature.split(",")
        pos = features[0]
        if pos == "名詞":
            # Nouns are looked up by their surface form.
            nouns_toadd.append(node.surface)
        elif pos in ("動詞", "形容詞") and len(features) > 7:
            # Verbs and adjectives are looked up by feature field 7
            # (presumably the dictionary form - verify against the feature
            # layout of the dictionary in use). Tokens whose feature list is
            # too short to have that field are skipped.
            target = verbs_toadd if pos == "動詞" else adjectives_toadd
            target.append(features[7])
        node = node.next

    # Keep, per part of speech, the 30 most frequent scores of this text.
    scores_nouns.append(_top_scores(nouns_toadd))
    scores_verbs.append(_top_scores(verbs_toadd))
    scores_adjectives.append(_top_scores(adjectives_toadd))
    # Progress indicator: index of the text that was just processed.
    print(i)

# Add the collected scores to the original dataset and save it as a new file.
nd["Noun Scores"] = scores_nouns
nd["Verb Scores"] = scores_verbs
nd["Adjective Scores"] = scores_adjectives
nd.to_csv("ds_with_scores.csv", index=False)
Code for visualisation
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation
import pandas as pd
# Load only the columns the plots need: the publication year plus, for each
# part of speech, the scores and how often each score occurs.
_score_columns = [
    "底本初版発行年1",
    "Noun Scores",
    "Noun Score count",
    "Verb Scores",
    "Verb Score count",
    "Adjective Scores",
    "Adjective Score count",
]
df = pd.read_csv(
    "dataset-with-seperated-scores.csv",
    encoding="utf8",
    usecols=_score_columns,
)
# First year of the timeframe of interest; func() uses this module-level
# value to pick the year it plots.
year = 1920
# One title per animation frame: "year 1920" up to and including "year 1959".
titles = [f"year {shown_year}" for shown_year in range(1920, 1960)]
# Function we will call for the animation. The animation will apply this function for each frame.
def func(frame, ax, a, titles, start_year=None):
    """Draw the noun-score bar plot for one animation frame.

    The year to plot is derived from the frame number (start year + frame)
    rather than from a counter that advances on every call. The animation
    machinery may call this function more than once for the same frame (for
    example for the initial draw and again while saving), and a call counter
    would then drift away from the title, which is chosen by frame number.

    Parameters:
        frame: zero-based frame index; also the index into `titles`.
        ax: the axes to draw on; it is cleared before drawing.
        a: dataframe with the columns "底本初版発行年1", "Noun Scores" and
            "Noun Score count".
        titles: sequence of titles, one per frame.
        start_year: year shown in frame 0. When omitted, the module-level
            variable `year` is used.

    Returns:
        A tuple (ax, next_year), where next_year is the year following the
        one that was just drawn.
    """
    if start_year is None:
        start_year = year
    plot_year = start_year + frame
    # Clear the axes so that each frame draws a new plot.
    ax.cla()
    # Keep only the rows of the year belonging to this frame.
    rows = a.loc[a["底本初版発行年1"] == plot_year]
    # Title and bars now always describe the same year.
    ax.set_title(titles[frame])
    ax.bar(rows["Noun Scores"], rows["Noun Score count"], width=0.05)
    return ax, plot_year + 1
# Create the figure and the single axes that every frame is drawn on.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# The animation has 40 frames, numbered 0 to 39.
frames = range(40)
# Build the animation: func is called for each frame with the axes, the
# dataframe and the titles as extra arguments; frames are one second apart.
ani = animation.FuncAnimation(
    fig,
    func,
    frames=frames,
    fargs=(ax, df, titles),
    interval=1000,
    repeat_delay=1000,
    blit=False,
)
# Write the animation to a GIF file using the Pillow writer.
ani.save(filename="nouns.gif", writer="pillow")