The solution to my case study consists of a Python script that retrieves and transforms the data, a SQL script that creates the “twitter” database, and this page, which presents the results of the analysis.
All scripts and data can be found at: https://github.com/JaroslavKotrba/TwitterDiePresse
# install.packages("RMySQL")
library(RMySQL)
mydb = dbConnect(MySQL(), user='Jaroslav_Kotrba', password='milites', dbname='twitter', host='localhost')
dbListTables(mydb)
## [1] "frequency" "length" "name" "tweet"
# name
rs = dbSendQuery(mydb, "select * from name;")
name = fetch(rs, n = -1)
dbClearResult(rs)
## [1] TRUE
# tweet
rs = dbSendQuery(mydb, "select * from tweet")
tweet = fetch(rs, n = -1)
dbClearResult(rs)
## [1] TRUE
# length
rs = dbSendQuery(mydb, "select * from length")
length = fetch(rs, n = -1)
dbClearResult(rs)
## [1] TRUE
# frequency
rs = dbSendQuery(mydb, "select * from frequency")
frequency = fetch(rs, n = -1)
dbClearResult(rs)
## [1] TRUE
# install.packages("tidyverse")
library(tidyverse)
data <- name %>%
  left_join(tweet, by='ID') %>%
  left_join(length, by='ID')
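The same joined table could also be assembled on the database side in a single query; a sketch, assuming ID is the shared key in all three tables:
# Equivalent join performed by MySQL instead of dplyr
data_sql <- dbGetQuery(mydb, "SELECT n.ID, n.NAME, t.TWEET, l.LENGTH
  FROM name n
  LEFT JOIN tweet t ON n.ID = t.ID
  LEFT JOIN length l ON n.ID = l.ID")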
# Summary data
summary(subset(data, select = c(NAME, TWEET, LENGTH)))
## NAME TWEET LENGTH
## Length:675 Length:675 Min. : 26.0
## Class :character Class :character 1st Qu.:129.5
## Mode :character Mode :character Median :137.0
## Mean :158.7
## 3rd Qu.:186.0
## Max. :405.0
NAME_groupby <- data.frame(data %>% group_by(NAME) %>%
  summarise(
    LENGTH_of_tweet_average = mean(LENGTH),
    freq = n()
  ))
head(NAME_groupby[order(-NAME_groupby$freq),],10)
## NAME LENGTH_of_tweet_average freq
## 403 MLGG2 245.8333 6
## 405 ModhoMia 171.0000 6
## 398 MingVaseDynasty 225.0000 4
## 114 Cdoglover1 193.0000 3
## 77 Big_DMB 137.0000 2
## 102 CaliChick777 127.0000 2
## 147 DawnRoseTurner 157.0000 2
## 259 inhenyerangBula 228.5000 2
## 295 Johnnygfrmbklyn 137.0000 2
## 333 La_Femme_Nista 112.0000 2
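The same top ten can also be written with dplyr verbs alone; a sketch, assuming dplyr >= 1.0 for slice_head:
# Pipe-only equivalent of the base-R ordering above
NAME_groupby %>%
  arrange(desc(freq)) %>%
  slice_head(n = 10)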
viz <- head(NAME_groupby[order(-NAME_groupby$freq),],10)
# install.packages("ggplot2")
library(ggplot2)
plot <- ggplot(viz, aes(x = freq, y = reorder(NAME, freq))) +
  geom_col(color="cornflowerblue", fill="white") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.8, color="cornflowerblue", size=3.5) +
  ylab('NAME') +
  xlab('COUNT') +
  theme_bw()
plot
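To keep the chart as a file, ggsave can be used; file name and dimensions below are illustrative:
# Save the most-active-users chart to disk
ggsave("top_users.png", plot, width = 7, height = 5, dpi = 300)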
head(data[order(-data$LENGTH),],10)
## ID NAME
## 325 325 williamlies247
## 95 95 tellmeofyourho1
## 47 47 hinz_tamara
## 447 447 America21568421
## 173 173 VanessaFuchsArt
## 442 442 Titus_Roach
## 93 93 EZembeck
## 645 645 NoahKamara99
## 355 355 Orenar4
## 503 503 MLGG2
## TWEET
## 325 rookscooper @teamtexasam @ku_football @texasfootball @sec @alabamaftbl @aggiefootball @big12conference number *4 in a pandemic shortened season by beating two teams over .500 with absolutely zero to show for it. beating your daddy bama isn't greater than a conference title game! do you even realize how pathetic you sound? once again zero natty's and zero conference titles (1998) https://t.co/l0b7pg9bcw
## 95 rashellc @freemymankyle @biancaskullmask @cerseile @leaveitalonekim @stat_butler cause it was a global pandemic that spread even to the vatican, the holy see in 4 months. it's flourished in 3rd world countries due to first world nations hoarding vaccines. that allows it to mutate & further plague us in waves. i'm questioning your sanity.
## 47 lculated2atee @merz @drleanawen @helenfa92229708 @denise_dewald child psychiatrist here \nmasking is so far down the list of pandemic related concerns of kids that i treat that i don't think it would even make the top 10.\nschool disruptions, deaths of relatives, worries about getting sick, mourning cancelled activities all rank far higher.
## 447 bertvonb1 @dosenbachken @hcrabber @scottpresler claiming i "don't realize 2019 isn't 2020" is relevant because biden is saving us? saving us from what? he's the only president to hit record inflation & nearly record high gas prices. this could've been avoided. prices never reached this under trump, in the middle of a pandemic.
## 173 ry_cee @crimsonartist @missigraine @kateemerson88 especially when their sacrifice was to get a jab and wear a hood mask. so easy! this stage of the pandemic is really revealing how easy it is for governments to disregard rights of disabled/high risk. and presented it in a way that allows able bodied to label us as over anxious
## 442 njycoon @hyper_high @hibernaculum23 @prisonplanet the who. 80 to 85% of ppl will experience no symptoms. 10 to 15% mild symptoms. 5% or less, severe symptoms and a fraction of that 5% will face mortality. \n\nwe have never been in a pandemic. humans have been mass tested for coronaviruses. which are common and change constantly.
## 93 rkchaos @rightsof_man @drleanawen i live in heavy blue state ma, it's really the prevailing view of many who followed the rules & got vax'd. it's shifted to a matter of individual risk assessment imo. throughout the pandemic i've always thought @scottgottliebmd & @ashishkjha have provided the most pragmatic views
## 645 ck_wilders @jt100uk @deldawna @presssec i'm not moving the goalposts. the person i replied to claimed vaccines were 100% effective before the pandemic which is just not true.\nthe salk vaccine was only 60-70% effective against pv1 in the study conducted at that time. it did however prevent bulbar polio in 94% of cases.
## 355 rgasmilan1 @darkagerave @lokijulianus loki had some excellent calls with regard to covid at the beginning of the pandemic. i believe it was he who first shared the hcq paper by raoult. i don't believe he is a patsy. i do however believe that rw twitter has gotten caught up in binary thinking surrounding geopolitics.
## 503 shprime a pandemic should be managed by epidemiologists with knowledge on pandemics. when are we going to hold accountable people who, without training or accountability, are doing so much harm? @patientombuds @cpso_ca @covidsciontario @opha_ontario @ottawahealthlaw @uoftnews @drjfrank https://t.co/vvyy3cozxc
## LENGTH
## 325 405
## 95 346
## 47 340
## 447 332
## 173 328
## 442 328
## 93 322
## 645 319
## 355 317
## 503 310
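Several of these entries exceed Twitter's standard 280-character limit, most likely because mentions and links were expanded during scraping; a quick count of such tweets:
# How many tweets exceed the 280-character limit?
sum(data$LENGTH > 280, na.rm = TRUE)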
x <- na.omit(data$LENGTH)
SD <- sd(x)
mean.length <- mean(x)
hist(x, breaks = 20, density = 20, prob=TRUE,
  main="Distribution of tweets character length",
  xlab="COUNT",
  ylab="DENSITY",
  cex.lab=1.2)
quant <- seq(min(x),max(x),length=100)
normaldens <- dnorm(quant,mean=mean.length,sd=SD)
lines(quant,normaldens,col="red",lwd=2)
lines(density(x), col="cornflowerblue",lwd=2)
legend("topright",c("normal distribution","observed distribution"),lty=c(1,1),
  col=c("red","cornflowerblue"),lwd=2)
words <- frequency
summary(subset(words, select = c(WORD, COUNT)))
## WORD COUNT
## Length:3775 Min. : 1.000
## Class :character 1st Qu.: 1.000
## Mode :character Median : 1.000
## Mean : 2.403
## 3rd Qu.: 2.000
## Max. :660.000
head(subset(words, select = c(WORD, COUNT))[order(-words$COUNT),],10)
## WORD COUNT
## 1 pandemic 660
## 2 people 63
## 3 covid 62
## 4 years 47
## 5 amp 44
## 6 covid19 43
## 7 like 41
## 8 2 40
## 9 dont 34
## 10 biden 32
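Note that "amp" in the list above is a remnant of the HTML entity &amp; rather than a real word; such artifacts could be dropped before visualising (the stop list below is illustrative):
# Remove HTML-entity remnants before plotting
words_clean <- subset(words, !(WORD %in% c("amp")))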
viz <- head(subset(words, select = c(WORD, COUNT))[order(-words$COUNT),],10)
# install.packages("ggplot2")
library(ggplot2)
plot <- ggplot(viz, aes(x = COUNT, y = reorder(WORD, COUNT))) +
  geom_col(color="cornflowerblue", fill="white") +
  geom_text(aes(label=COUNT), vjust=0.5, hjust=1.25, color="cornflowerblue", size=3.5) +
  ylab('WORD') +
  theme_bw()
plot
# install.packages("wordcloud2")
library(wordcloud2)
viz <- head(subset(words, select = c(WORD, COUNT)), 1000)
wordcloud2(data=viz, size = 4, color='random-dark', backgroundColor = "White")
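Since wordcloud2 returns an htmlwidget, the interactive cloud can also be exported as a standalone page; a sketch, with an illustrative file name:
# install.packages("htmlwidgets")
library(htmlwidgets)
cloud <- wordcloud2(data=viz, size = 4, color='random-dark', backgroundColor = "White")
saveWidget(cloud, "wordcloud.html", selfcontained = TRUE)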
Created by: Jaroslav Kotrba
https://jaroslavkotrba.com