Code for SVG // svg.html
<script> // lovecraft data from adjFILTERED.tsv
// Lovecraft rows, kept as compact [word, synset] pairs and expanded into
// { word, synset } records below. Order and repeated words (e.g. "wild",
// "good") are preserved exactly; draw() uses `synset` as the bubble radius.
const lovecraftData = [
  ["wild", 17], ["deep", 21], ["dark", 16], ["low", 16], ["good", 27],
  ["open", 36], ["blue", 16], ["big", 17], ["soft", 20], ["bad", 17],
  ["last", 21], ["dead", 21], ["black", 22], ["white", 25], ["hot", 21],
  ["fit", 16], ["cold", 16], ["right", 36], ["short", 23], ["heavy", 30],
  ["well", 22], ["good", 27], ["high", 18], ["dry", 19], ["live", 19],
  ["solid", 18], ["clear", 45], ["close", 37], ["second", 15], ["present", 18],
  ["heavy", 30], ["advanced", 20], ["back", 28], ["true", 15], ["dull", 19],
  ["sharp", 15], ["standing", 21], ["last", 21], ["foul", 16], ["regular", 17],
  ["closing", 23], ["darker", 45], ["wild", 17], ["light", 47], ["short", 23],
  ["rough", 18], ["first", 16], ["loose", 18], ["separate", 19], ["upset", 17],
  ["closed", 26], ["gray", 15], ["dull", 19], ["blue", 16], ["flat", 24],
  ["fair", 17], ["frozen", 17], ["top", 22], ["hard", 22], ["soft", 20],
  ["left", 24], ["right", 36], ["sound", 24], ["free", 22], ["crossing", 15],
].map(([word, synset]) => ({ word, synset }));
// PoeData from adjFILTERED.tsv
// Poe rows, kept as compact [word, synset] pairs and expanded into
// { word, synset } records below. Order and repeated words (e.g. "true",
// "last") are preserved exactly; draw() uses `synset` as the bubble radius.
const poeData = [
  ["dark", 16], ["open", 36], ["well", 22], ["true", 15], ["good", 27],
  ["high", 18], ["last", 21], ["close", 37], ["clear", 45], ["first", 16],
  ["present", 18], ["square", 25], ["puff", 17], ["deep", 21], ["right", 36],
  ["closed", 26], ["second", 15], ["short", 23], ["true", 15], ["blue", 16],
  ["broken", 22], ["directed", 15], ["wild", 17], ["easy", 15], ["fair", 17],
  ["black", 22], ["high", 18], ["sound", 24], ["regular", 17], ["solid", 18],
  ["flat", 24], ["white", 25], ["fitting", 15], ["tight", 16], ["square", 25],
  ["heavy", 30], ["solid", 18], ["free", 22], ["sweet", 16], ["big", 17],
  ["soft", 20], ["hard", 22], ["separate", 19], ["direct", 24], ["double", 21],
  ["low", 16], ["dead", 21], ["light", 47], ["flush", 18], ["sharp", 15],
  ["round", 25], ["loose", 18], ["wild", 17], ["rough", 18], ["extended", 22],
  ["easy", 15], ["cold", 16], ["marked", 18], ["subject", 15], ["good", 27],
  ["clear", 45], ["dull", 19], ["gray", 15], ["last", 21], ["still", 18],
  ["striking", 25], ["straight", 21], ["frozen", 17], ["striped", 15], ["free", 22],
  ["direct", 24], ["last", 21], ["fast", 15], ["covered", 27], ["fair", 17],
  ["round", 25], ["hot", 21], ["fit", 16], ["wilder", 15], ["bad", 17],
].map(([word, synset]) => ({ word, synset }));
// Unique words that occur in both datasets, listed in the order they first
// appear in lovecraftData.
const sharedWords = (() => {
  const poeWords = new Set(poeData.map(({ word }) => word));
  const alreadySeen = new Set();
  const shared = [];
  for (const { word } of lovecraftData) {
    if (alreadySeen.has(word)) {
      continue;
    }
    alreadySeen.add(word);
    if (poeWords.has(word)) {
      shared.push(word);
    }
  }
  return shared;
})();
// Chart container and the button that switches between the two views.
const svg = document.getElementById("bubbleChart");
const toggleBtn = document.getElementById("toggleBtn");

// When true, renderBubbles() draws only the words found in both datasets.
let showSharedOnly = false;

toggleBtn.addEventListener("click", () => {
  showSharedOnly = !showSharedOnly;
  // The label names the view the next click will switch to.
  if (showSharedOnly) {
    toggleBtn.textContent = "Show All Words";
  } else {
    toggleBtn.textContent = "Show Shared Words Only";
  }
  renderBubbles(lovecraftData, poeData);
});
/**
 * Append one labelled bubble per entry of `data` to the chart.
 *
 * Bubbles are laid out left-to-right in rows of four, starting at
 * (`offsetX`, 100). Each bubble's radius is the entry's `synset` value.
 * The outer `maxWidth` / `maxHeight` variables are grown to cover every
 * bubble drawn so the caller can size the viewBox afterwards.
 *
 * @param {{word: string, synset: number}[]} data - Entries to draw.
 * @param {number} offsetX - X coordinate of the first column's centers.
 * @param {string} color1 - Circle fill color.
 * @param {string} color2 - Circle stroke color.
 */
function draw(data, offsetX, color1, color2) {
  const SVG_NS = "http://www.w3.org/2000/svg";
  const COLUMNS = 4;
  const COLUMN_GAP = 150;
  const ROW_GAP = 130;
  const TOP = 100;

  // Create a namespaced SVG element and apply the given attributes in order.
  const makeElement = (tag, attributes = {}) => {
    const element = document.createElementNS(SVG_NS, tag);
    for (const [key, value] of Object.entries(attributes)) {
      element.setAttribute(key, value);
    }
    return element;
  };

  for (const [index, { word, synset }] of data.entries()) {
    const centerX = offsetX + (index % COLUMNS) * COLUMN_GAP;
    const centerY = TOP + Math.floor(index / COLUMNS) * ROW_GAP;

    const bubble = makeElement("g", { class: "bubble" });
    const disc = makeElement("circle", {
      cx: centerX,
      cy: centerY,
      r: synset,
      fill: color1,
      stroke: color2,
      "stroke-width": "2",
    });

    // The <title> lives inside the circle (not the group) so the tooltip
    // is attached to the circle itself.
    const tooltip = makeElement("title");
    tooltip.textContent = `${word} (Synset: ${synset})`;
    disc.appendChild(tooltip);

    // Nudge the label down 4 units so it sits visually centered in the circle.
    const label = makeElement("text", {
      x: centerX,
      y: centerY + 4,
      "text-anchor": "middle",
    });
    label.textContent = word;

    bubble.appendChild(disc);
    bubble.appendChild(label);
    svg.appendChild(bubble);

    maxWidth = Math.max(maxWidth, centerX + synset * 2);
    maxHeight = Math.max(maxHeight, centerY + synset * 2);
  }
}
// Extents of everything drawn in the current render pass. draw() grows them
// per bubble and renderBubbles() turns them into the viewBox. They are
// declared explicitly so the assignments do not create implicit globals,
// which throw a ReferenceError in strict mode and in ES modules.
let maxWidth = 0;
let maxHeight = 0;

/**
 * Clear the chart and redraw it for the current view mode.
 *
 * With `showSharedOnly` set, only the `lovecraft` entries whose word is in
 * `sharedWords` are drawn (yellow). Otherwise both datasets are drawn side
 * by side: Lovecraft in green on the left, Poe in purple on the right.
 * Afterwards the viewBox is sized to the drawn extents plus a 100-unit margin.
 *
 * @param {{word: string, synset: number}[]} lovecraft - Lovecraft entries.
 * @param {{word: string, synset: number}[]} poe - Poe entries.
 */
function renderBubbles(lovecraft, poe) {
  svg.innerHTML = "";
  // Reset the extents so a smaller view does not inherit the previous size.
  maxWidth = 0;
  maxHeight = 0;
  if (showSharedOnly) {
    const shared = lovecraft.filter(({ word }) => sharedWords.includes(word));
    draw(shared, 200, "#facc15", "#eab308"); // Yellow shared
  } else {
    draw(lovecraft, 150, "#69e864", "#339f0e"); // Green Lovecraft
    draw(poe, 750, "#c084fc", "#9333ea"); // Purple Poe
  }
  svg.setAttribute("viewBox", `0 0 ${maxWidth + 100} ${maxHeight + 100}`);
  svg.setAttribute("preserveAspectRatio", "xMidYMin meet");
}
// Initial render: showSharedOnly starts as false, so both datasets are drawn.
renderBubbles(lovecraftData, poeData);
</script>
Python to create the Cytoscape // words.html
import os
import spacy
from nltk.corpus import wordnet as wn
import pandas as pd
# Load the spaCy pipeline once; it is reused for every input file below.
nlp = spacy.load("en_core_web_md")
# Directory that is scanned for "*Sentences.txt" input files.
collPath = 'corpora'
def wordCollector(words, unit, minSynsets=15):
    """Collect the adjectives in a parsed document that have many WordNet synsets.

    Args:
        words: Iterable of spaCy tokens (e.g. a parsed ``Doc``); each token's
            ``pos_`` and ``lemma_`` are read.
        unit: Label stored in the ``unit`` column of every returned row
            (the caller passes the source file's base name).
        minSynsets: Minimum number of WordNet synsets a lemma must have to be
            kept. Defaults to 15, the threshold originally hard-coded here.

    Returns:
        A pandas DataFrame with the columns ``word`` (lemma), ``nodeType``
        (the token's POS tag), ``unit`` and ``synset`` (synset count), one
        row per matching token occurrence. The columns are present even
        when nothing matches.
    """
    columns = ['word', 'nodeType', 'unit', 'synset']
    rows = []
    for token in words:
        if token.pos_ != "ADJ":
            continue
        # Look the lemma up once and reuse the count for both the filter and
        # the stored value.
        # NOTE(review): wn.synsets is called without a pos filter, so the
        # count covers every part of speech of the lemma, not only adjectives.
        synsetCount = len(wn.synsets(token.lemma_))
        if synsetCount >= minSynsets:
            rows.append((token.lemma_, token.pos_, unit, synsetCount))
    return pd.DataFrame(rows, columns=columns)
# Parse every "*Sentences.txt" file in collPath and collect its adjectives.
allDataFrames = []
# Sorted so the output's row order does not depend on the directory listing order.
for file in sorted(os.listdir(collPath)):
    if not file.endswith("Sentences.txt"):
        continue
    filepath = os.path.join(collPath, file)
    # The file name without its extension labels every row from this file.
    name = os.path.splitext(file)[0]
    with open(filepath, 'r', encoding='utf8') as f:
        readFile = f.read()
    spacyRead = nlp(readFile)
    myDataFrame = wordCollector(spacyRead, name)
    allDataFrames.append(myDataFrame)

# pd.concat raises ValueError on an empty list, so stop with a clear message
# when no input file matched.
if not allDataFrames:
    raise SystemExit(f"No '*Sentences.txt' files found in {collPath!r}; nothing to save.")

# Make an output filepath
outputFilePath = 'adjFILTERED.tsv'
# Turn the list of dataframes into one dataframe:
fullDataFrame = pd.concat(allDataFrames, ignore_index=True)
fullDataFrame.to_csv(outputFilePath, sep='\t', index=False)
print('I just saved a dataframe as a TSV file.')
# Go check your filestash for the file.
Regex Key // regKey.md
## Regex Steps
# Lovecraft
1. Added `<xml>` over entire document.
2. Found `^\n*(.+\n(?:.+\n)*)` and replaced it with `<para>$1</para>` to divide the work into paragraphs.
3. Found Gutenberg headings, titles, and beginning quotes from other authors and wrapped them in a
`<meta>` tag. Changed `<para>` tags to `<info>` tags within these for clarity.
4. Found `(?<=<para>)([^<]+?)([.!?])` and wrapped in `<s>` tag for first sentence. Oxygen
didn't play nice with the para tags at the beginning, so this was necessary.
5. Marked the rest of the `<s>` tags by using `(?<=</s>)([^<]+?)([.!?])`. Had to
continually find and replace in order to get everything in a paragraph, as this simply
targeted the next sentence.
6. Used `"(.*?)"` to find quoted passages and wrapped them in a `<quote>` tag.
**Note:** Most paragraphs that consist *entirely* of quotes do not have `<s>` tags.
I figured it could be valuable to have paragraphs with only quotations as a
datapoint, so I didn't remedy this.
# Poe
1. Almost everything is the same as with the Lovecraft edits. However,
Poe's use of a single opening quotation mark for dialogue (e.g., "Hello blah blah blah,
continue onto next line...) instead of wrapping it in a pair of quotation marks made the quotes
a little harder to track. Some provisions were made by searching for only one quotation mark,
but some quotes may have fallen through the cracks.