Warandwords scripts

Scripts

Here you can find the scripts we used in our projects. Additional explanation of the code is given in the comments (the text after each `#`) in the code.

Code for sentiment analysis

    import MeCab as mc
    import pandas as pd
    import numpy as np
    import os
    from collections import Counter

    # Load the word list with sentiment scores; only the lemma and score columns are needed.
    path_dic = os.path.join('dic', 'pn_ja_columnNames.dic')
    df_pn = pd.read_csv(path_dic, usecols=['lemma', 'score'], sep=",", encoding="utf8")


    # Set up MeCab. `tokenizer` is the tagger used to parse the book texts below.
    # NOTE(review): MeCab's Tagger takes a command-line style option string and a
    # dictionary is normally selected with "-d <dictionary dir>" - confirm that this
    # argument really loads the intended dictionary.
    tokenizer = mc.Tagger("unidic-kindai-bungo-v202512")
    # Second tagger with the default settings; it is not used in the code shown here.
    mecab = mc.Tagger()

    # The dataset to which we want to add the scores at the end
    nd = pd.read_csv("wpc-only-needed-timeframe.csv")

    # The same CSV again, reduced to the filename column. Iterating over it gives the
    # name of the full-text file that belongs to each row.
    filenames = pd.read_csv(
        "wpc-only-needed-timeframe.csv",
        usecols=["Tokenized Filename"],
        encoding="utf8",
    )

    # One entry per file is appended to each of these lists; they become new
    # columns of the nd dataframe afterwards.
    scores_nouns, scores_verbs, scores_adjectives = [], [], []

    # Folder that contains the text files of the books
    link = "C:/Users/Ariana/wpc/database adding/tokenized/"

    # Row counter used while looping over the filenames
    i = 0

    # Build the lemma -> score lookup once, before the loop, so every word costs a
    # single dictionary lookup instead of two scans over the whole score table.
    # Keeping the first row of a duplicated lemma matches the `.iloc[0]` used before,
    # and `to_numpy()` keeps the scores as NumPy scalars, as `.iloc[0]` returned them.
    unique_lemmas = df_pn.drop_duplicates(subset="lemma", keep="first")
    lemma_scores = dict(
        zip(unique_lemmas["lemma"].to_numpy(), unique_lemmas["score"].to_numpy())
    )

    # Loop over the filename column directly. This replaces recovering the name from
    # the text produced by `to_string()`, which depends on pandas' display formatting.
    # `i` is only used to report progress.
    for i, filename in enumerate(filenames["Tokenized Filename"]):
        textFile = link + filename

        # Read the whole file and remove the spaces. The `with` block makes sure
        # the file is closed again, also when reading fails.
        with open(textFile, encoding="utf8") as text_handle:
            sent = text_handle.read().replace(" ", "")

        # Parse the text with MeCab; `node` is the first element of a linked list.
        # `sent` has to stay alive while the nodes are being read.
        node = tokenizer.parseToNode(sent)

        # Words found in this file, grouped by part of speech.
        nouns_toadd = []
        verbs_toadd = []
        adjectives_toadd = []

        # Walk through all nodes of the parsed text
        while node:
            # Split the feature string once; field 0 is the part of speech.
            features = node.feature.split(",")
            part_of_speech = features[0]

            # Nouns are looked up by their surface form, verbs and adjectives by
            # field 7 of the feature string. Tokens whose feature string is too
            # short to contain field 7 are skipped instead of raising IndexError.
            if part_of_speech == u"名詞":
                nouns_toadd.append(node.surface)
            elif part_of_speech == u"動詞" and len(features) > 7:
                verbs_toadd.append(features[7])
            elif part_of_speech == u"形容詞" and len(features) > 7:
                adjectives_toadd.append(features[7])

            # Go to the next node.
            node = node.next

        # Collect the score of every word that occurs in the score table.
        # Words without a score are ignored.
        nscores = [lemma_scores[noun] for noun in nouns_toadd if noun in lemma_scores]
        vscores = [lemma_scores[verb] for verb in verbs_toadd if verb in lemma_scores]
        ascores = [
            lemma_scores[adjective]
            for adjective in adjectives_toadd
            if adjective in lemma_scores
        ]

        # Append the 30 most common scores (as (score, count) pairs) to the lists
        # defined outside of this loop
        scores_nouns.append(Counter(nscores).most_common(30))
        scores_verbs.append(Counter(vscores).most_common(30))
        scores_adjectives.append(Counter(ascores).most_common(30))

        # Print to keep track of where we are in the loop
        print(i)

    # Attach the collected per-file scores to the original dataset as three new
    # columns (one per part of speech), then write the result to a new CSV file.
    for column_name, per_file_scores in (
        ("Noun Scores", scores_nouns),
        ("Verb Scores", scores_verbs),
        ("Adjective Scores", scores_adjectives),
    ):
        nd[column_name] = per_file_scores

    # index=False leaves the dataframe index out of the file
    nd.to_csv("ds_with_scores.csv", index=False)

Code for visualisation

    import matplotlib.pyplot as plt
    import numpy as np
    import matplotlib.animation as animation
    import pandas as pd

    # Columns needed for the plots: the publication year plus, for each part of
    # speech, the scores and how often each score occurs.
    needed_columns = [
        "底本初版発行年1",
        "Noun Scores", "Noun Score count",
        "Verb Scores", "Verb Score count",
        "Adjective Scores", "Adjective Score count",
    ]
    df = pd.read_csv("dataset-with-seperated-scores.csv", usecols=needed_columns, encoding="utf8")

    # The starting year of our timeframe of interest
    year = 1920

    # One title per frame ("year 1920" ... "year 1959"), so that every frame of the
    # animation gets its own title
    titles = list(map("year {}".format, range(1920, 1960)))

    # Function we will call for the animation. The animation will apply this function for each frame.
    def func(frame, ax, a, titles, first_year=1920):
        """Draw the bar plot of noun scores for the year that belongs to `frame`.

        Parameters:
            frame: index of the frame to draw; frame 0 is `first_year`.
            ax: the axes to draw on; it is cleared before drawing.
            a: dataframe with the columns "底本初版発行年1", "Noun Scores" and
                "Noun Score count".
            titles: sequence of titles; `titles[frame]` is used as plot title.
            first_year: year shown in frame 0. It has to match the first year
                that was used to build `titles`.

        Returns:
            A tuple of the axes and the year following the one that was drawn.
        """
        # The module-level variable is still updated so that it keeps the same
        # meaning as before: the year after the frame that was drawn last.
        global year

        # Derive the year from the frame number instead of counting calls.
        # FuncAnimation can call this function more than once for the same frame
        # (for example for the initial draw when no init_func is given); with a
        # call counter the plotted data would then run ahead of the title.
        shown_year = first_year + frame

        # Clear the axis so that each frame draws a new plot
        ax.cla()

        # Select the rows of this year and take the two columns we wish to plot
        rows_of_year = a.loc[a["底本初版発行年1"] == shown_year]
        scores = rows_of_year["Noun Scores"]
        counts = rows_of_year["Noun Score count"]

        # Set the title from the array of titles, and plot the data in a bar plot
        ax.set_title(titles[frame])
        ax.bar(scores, counts, width=0.05)

        year = shown_year + 1
        return ax, year

    # Create the figure with a single set of axes to draw on
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # The animation has 40 frames, numbered 0 to 39
    frames = range(40)

    # Build the animation: func is called for every frame and receives the axes,
    # the dataframe and the titles as extra arguments. The interval between frames
    # and the delay before a repeat are both 1000 milliseconds.
    ani = animation.FuncAnimation(
        fig,
        func,
        frames=frames,
        fargs=(ax, df, titles),
        interval=1000,
        repeat_delay=1000,
        blit=False,
    )

    # Write the animation to a GIF file using the pillow writer
    ani.save("nouns.gif", writer="pillow")