### Importing the libraries used throughout the project
import pandas as pd
import numpy as np
import requests
import tweepy
import json
#show the full column width in every cell
pd.set_option('display.max_colwidth', None)
In this section we will collect data from three different sources to analyze the dog ratings data.
# loading the archive data into local_df and showing the head
local_df = pd.read_csv('twitter-archive-enhanced.csv')
local_df.head(7)
#loading the image predictions (neural network) data into image_df using requests
#url
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
#getting the file and saving it
response = requests.get(url)
with open('image_predictions.tsv', mode = 'wb') as f:
    f.write(response.content)
#load it into the image_df dataframe and show the head
image_df = pd.read_csv('image_predictions.tsv', sep = '\t')
image_df.head(7)
#downloading tweets json to tweet_json.txt
#note: I had trouble running this chunk due to a bad connection,
#so I kept it here and used the previously saved file in the next chunk
with open('keys.txt', 'r') as f: ## a file where I saved the keys as a json object (not included in the submission)
    #expected to contain something like {"key": "<consumer key>", "secret": "<consumer secret>"}
    keys = json.load(f)
auth = tweepy.AppAuthHandler(keys['key'], keys['secret'])
api = tweepy.API(auth, wait_on_rate_limit=True)
tweet_ids = local_df['tweet_id'].values
with open('tweet_json.txt', 'w') as outfile:
    for tweet_id in tweet_ids:
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except Exception:
            #print the ids that could not be fetched (deleted tweets, protected accounts, etc.)
            print(tweet_id)
#load retweets and likes
with open('tweet_json.txt', 'r') as f:
    elements = [] #list of dictionaries to transform later into a dataframe
    for line in f:
        observation = json.loads(line)
        element = {} #temporary dict holding one tweet's data
        element['tweet_id'] = observation['id']
        element['rt_count'] = observation['retweet_count']
        element['fav_count'] = observation['favorite_count']
        elements.append(element)
#create the data frame and show its info
json_df = pd.DataFrame(elements)
json_df.info()
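As a side note, the same dataframe could be built in one step with pandas' line-delimited JSON reader; a minimal sketch, assuming every line of tweet_json.txt is a complete JSON object (json_df_alt is a throwaway name, not used below):
# sketch: read the line-delimited JSON directly and keep only the needed columns
json_df_alt = pd.read_json('tweet_json.txt', lines = True)
json_df_alt = json_df_alt[['id', 'retweet_count', 'favorite_count']].rename(
    columns = {'id' : 'tweet_id', 'retweet_count' : 'rt_count', 'favorite_count' : 'fav_count'})
json_df_alt.info()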
#local dataframe
local_df.sample(10)
#image dataframe
image_df.sample(10)
#API dataframe
#move id to first column
cols = ['tweet_id', 'rt_count', 'fav_count']
json_df = json_df[cols]
json_df.sample(10)
The API-collected data looks clean on visual inspection.
# show info about the local_df dataframe
local_df.info()
As mentioned before, there are retweets that need to be deleted, and there are missing values in expanded_urls.
New issue: timestamp is a string, not a datetime (consistency).
#summary of numeric variables local_df
local_df.describe()
#count of ratings with a denominator not equal to 10
local_df[local_df['rating_denominator'] != 10].shape[0]
#show tweets with denominator != 10 to study them
local_df[local_df['rating_denominator'] != 10][['tweet_id', 'retweeted_status_id', 'text']]
Contain dates parsed as ratings (id, correct numerator): (740373189193256964, 14), (722974582966214656, 13)
Retweets: 775096608509886464
Groups of dogs: 820690176645140481, 758467244762497024, 731156023742988288, 713900603437621249, 710658690886586372, 709198395643068416, 704054845121142784, 697463031882764288, 684225744407494656, 684222868335505415, 677716515794329600, 675853064436391936
No rating can be extracted: 832088576586297345, 810984652412424192, 686035780142297088, 682808988178739200
Wrong parsing (id, correct numerator): (835246439529840640, 13), (716439118184652801, 11), (682962037429899265, 10), (666287406224695296, 9)
Note: the retweets will be deleted later and all ratings will be normalized to a common scale for the analysis, so the group ratings don't need editing.
Tidiness issue: rating_numerator and rating_denominator together represent one variable (the rating).
#outliers in numerators
local_df.query('rating_numerator / rating_denominator > 2')[['tweet_id', 'text']]
Wrong parsing (id, correct numerator): (786709082849828864, 10), (778027034220126208, 11), (680494726643068929, 11)
Previously mentioned: 835246439529840640, 810984652412424192
Overrated (not good for analytical models): 855862651834028034, 855860136149123072, 838150277551247360, 749981277374128128, 670842764863651840
Retweets: 832215909146226688
Note: it looks like decimal ratings aren't parsed correctly here, so let's take a look.
#ratings with fractions
local_df[local_df.text.str.contains(r'[0-9]\.[0-9]')][['tweet_id', 'text', 'rating_numerator']]
Previously mentioned: 832215909146226688, 786709082849828864, 778027034220126208, 680494726643068929
Wrong parsing (id, correct rounded numerator): (883482846933004288, 14), (681340665377193984, 10)
The wrongly parsed ratings will be corrected to a rounded value.
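Instead of fixing each id by hand, the decimal ratings could also be re-extracted from the text with a regex; a minimal sketch for inspection only, assuming ratings always appear in a form like 13.5/10 (re_extracted is a throwaway name):
# sketch: re-extract numerator and denominator for tweets whose text contains a decimal rating
decimal_mask = local_df.text.str.contains(r'\d+\.\d+/\d+', na = False)
re_extracted = local_df.loc[decimal_mask, 'text'].str.extract(r'(\d+\.\d+)/(\d+)')
re_extracted.columns = ['extracted_numerator', 'extracted_denominator']
re_extracted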
#repeated tweets
local_df.tweet_id.nunique()
Consistency issue: normalize all names to one capitalization style (e.g. "Sam").
image_df.info()
#repeated observations
image_df.tweet_id.nunique()
#prediction confidences > 1 (should not occur)
image_df.query('p1_conf > 1 | p2_conf > 1 |p3_conf > 1')[['p1_conf', 'p2_conf', 'p3_conf']]
Nothing to add here; the same goes for the API-collected data.
#repeated observations in parsed tweets
json_df.tweet_id.nunique()
All quality issues are in the local_df dataframe.
The retweets issue could be classified as a tidiness issue, since our table's unit is tweets rather than retweets; but because after merging we will study dog ratings rather than the tweets themselves, retweets are treated here as repeated (invalid) observations.
The steps are carried out in the code chunks below; each step has two chunks labeled with its number in a comment (one for cleaning and one for testing).
I began with completeness issues and then moved to consistency, but the order differs a little because some steps depend on others.
Example: the overall_rating column depends on the ratings being corrected first.
#1 , 2
#copy data
local_df_copy = local_df.copy()
image_df_copy = image_df.copy()
json_df_copy = json_df.copy()
#extract the real source from the source variable in local_df_copy
local_df_copy['source'] = local_df_copy['source'].map(lambda x : x[x.find('>') + 1: x.rfind('<')])
#1, 2 Test
local_df_copy.sample(5)['source']
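For reference, the same extraction could be done with a regex instead of find/rfind; a small sketch on the untouched local_df, assuming every source value is an anchor tag like <a href="...">Twitter for iPhone</a> (source_alt is a throwaway name):
# sketch: regex alternative to the slicing above (not applied to local_df_copy)
source_alt = local_df['source'].str.extract(r'>([^<]+)<', expand = False)
source_alt.value_counts()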
#3
#remove the expanded urls column
local_df_copy.drop(axis = 1, labels = 'expanded_urls', inplace = True)
#3 test
local_df_copy.info()
#4 delete the retweets
local_df_copy.drop(local_df_copy[local_df_copy['retweeted_status_id'].notna()].index, inplace = True)
#4 test
local_df_copy.info()
#drop retweet columns
local_df_copy.drop(axis = 1,
labels = ['retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'],
inplace = True)
local_df_copy.info()
#5 delete impossible to get and overrating tweets
to_delete_ids = [832088576586297345, 810984652412424192, 686035780142297088, 682808988178739200, 855862651834028034, 855860136149123072, 838150277551247360, 749981277374128128, 670842764863651840]
local_df_copy.drop(local_df_copy[local_df_copy['tweet_id'].isin(to_delete_ids)].index, inplace = True)
#5 test values of numerator
local_df_copy.query('rating_numerator > 2 * rating_denominator')['text']
#5 test values of denominator
local_df_copy.query('rating_denominator != 10')['text']
All tweets with overratings or without a valid rating are now deleted.
#6 modifying timestamp
local_df_copy['timestamp'] = pd.to_datetime(local_df_copy['timestamp'])
#6 test
local_df_copy.info()
#7 modify wrong ratings
#wrongly parsed ids and their corrected numerators (11 observations)
wrong_parsing_ids = [(740373189193256964, 14),
(722974582966214656, 13),
(835246439529840640, 13),
(716439118184652801, 11),
(682962037429899265, 10),
(666287406224695296, 9),
(786709082849828864, 10),
(778027034220126208, 11),
(680494726643068929, 11),
(883482846933004288, 14),
(681340665377193984, 10)]
for tweet, numerator in wrong_parsing_ids:
    print('id : {}\nnumerator:{}'.format(tweet, numerator))
    local_df_copy.replace(to_replace = local_df_copy[local_df_copy['tweet_id'] == tweet],
                          value = {'rating_numerator' : numerator, 'rating_denominator' : 10},
                          inplace = True)
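DataFrame.replace with a whole DataFrame as to_replace is an unusual pattern and may not behave the same across pandas versions; an equivalent, more conventional correction uses .loc indexing. A sketch performing the same assignment (harmless to re-run after the loop above):
# sketch: apply the same corrections with boolean indexing instead of replace()
for tweet, numerator in wrong_parsing_ids:
    local_df_copy.loc[local_df_copy['tweet_id'] == tweet,
                      ['rating_numerator', 'rating_denominator']] = [numerator, 10]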
#7 test
check_ids = [tweet for tweet, _ in wrong_parsing_ids]
local_df_copy[local_df_copy.tweet_id.isin(check_ids)][['tweet_id', 'rating_numerator', 'rating_denominator']]
#8 modify dog names
local_df_copy['name'] = local_df_copy['name'].str.lower().str.strip().str.capitalize()
local_df_copy.replace({'name' : 'None'}, {'name' : 'No Name'}, inplace = True)
#8 test names
local_df_copy.sample(20)['name']
#8 test2
np.sort(local_df_copy['name'].unique())
New issues to iterate over: the values ['A', 'An', 'All', 'Actually', 'Bookstore', 'Unacceptable'] are not real names.
Cleaning (define): replace ['A', 'An', 'All', 'Actually', 'Bookstore', 'Unacceptable'] with 'No Name' in the name column.
# replace ['A', 'An', 'All', 'Actually', 'Bookstore', 'Unacceptable'] with 'No Name' in the name column
local_df_copy['name'] = local_df_copy['name'].replace(['A', 'An', 'All', 'Actually', 'Bookstore', 'Unacceptable'], 'No Name')
#test it
assert local_df_copy.query("name in ['A', 'An', 'All', 'Actually', 'Bookstore', 'Unacceptable']").shape[0] == 0
#9 edit the stage columns in local_df (doggo, floofer, pupper, puppo) into 0/1 dummies
stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
local_df_copy[stage_cols] = local_df_copy[stage_cols].replace({'doggo' : 1,
                                                               'floofer' : 1,
                                                               'pupper' : 1,
                                                               'puppo' : 1,
                                                               'None' : 0})
#9 test
local_df_copy.info()
#9 test2
local_df_copy.doggo.unique()
The step of collapsing these four columns into one "stage" column is skipped, since the dummy variables are more useful for the analysis than a single column; a sketch of the long-format alternative follows for reference.
PS: the wide-data approach works better in this case.
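The rejected long-format alternative would look roughly like this; a sketch only, not applied, assuming the four stage columns are already 0/1 dummies from step #9 (stage_long is a throwaway name):
# sketch: collapse the four stage dummies into a single 'stage' column
stage_long = local_df_copy.melt(id_vars = 'tweet_id',
                                value_vars = ['doggo', 'floofer', 'pupper', 'puppo'],
                                var_name = 'stage', value_name = 'is_stage')
stage_long = stage_long.query('is_stage == 1').drop(columns = 'is_stage')
stage_long.head()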
#11 overall_rating column (used in all analyses): overall_rating = (rating_numerator / rating_denominator) * 10
local_df_copy['overall_rating'] = (local_df_copy['rating_numerator'].divide(local_df_copy['rating_denominator']) * 10).round().astype(int)
#11 test
local_df_copy.info()
#11 test2
local_df_copy['overall_rating'].describe()
New issues to iterate over:
#zero ratings
local_df_copy.query('overall_rating == 0')[['tweet_id', 'rating_numerator', 'rating_denominator', 'text']]
They are valid and accurate but need to be removed for consistency (outliers).
#delete tweets 835152434251116546, 746906459439529985
local_df_copy.drop(local_df_copy.query('overall_rating == 0').index, inplace = True)
#test
assert local_df_copy.query('overall_rating == 0').shape[0] == 0
#12 construct a new column named p_dog in image_df_copy
image_df_copy['p_dog'] = np.zeros(image_df_copy.shape[0], int)
#ids with dog pics
dog_ids = image_df_copy.query('p1_dog | p2_dog | p3_dog')['tweet_id']
for tweet in dog_ids:
    image_df_copy.replace(to_replace = image_df_copy[image_df_copy['tweet_id'] == tweet],
                          value = {'p_dog' : 1},
                          inplace = True)
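The loop above works but is slow for ~2000 rows; a vectorized sketch that produces the same 0/1 column (safe to run after the loop, since it just recomputes the same values):
# sketch: vectorized equivalent of the loop above
image_df_copy['p_dog'] = (image_df_copy['p1_dog'] | image_df_copy['p2_dog'] | image_df_copy['p3_dog']).astype(int)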
#12 testing
image_df_copy.info()
#12 testing2
image_df_copy.sample(10)[['p1_dog', 'p2_dog', 'p3_dog', 'p_dog']]
#12 testing3
assert image_df_copy.query('p1_dog | p2_dog | p3_dog')['p_dog'].equals(image_df_copy.query('p_dog == 1')['p_dog'])
#13 delete rows with p_dog = 0
image_df_copy.drop(image_df_copy[image_df_copy['p_dog'] == 0].index, inplace = True)
#13 test
assert image_df_copy.query('p_dog == 0').shape[0] == 0
#14 breed column
#change p_dog to record which prediction to trust for the breed (1 = p1, 2 = p2, 3 = p3)
image_df_copy.replace(to_replace = image_df_copy.query('p3_dog'),
value = {'p_dog' : 3},
inplace = True)
image_df_copy.replace(to_replace = image_df_copy.query('p2_dog'),
value = {'p_dog' : 2},
inplace = True)
image_df_copy.replace(to_replace = image_df_copy.query('p1_dog'),
value = {'p_dog' : 1},
inplace = True)
#test first step
image_df_copy['p_dog'].unique()
#step 2: split the image dataframe into 3 subsets and fill the breed column in each
p1_dog = image_df_copy.query('p_dog == 1').copy() #.copy() avoids SettingWithCopyWarning on the breed assignments below
p2_dog = image_df_copy.query('p_dog == 2').copy()
p3_dog = image_df_copy.query('p_dog == 3').copy()
p1_dog['breed'] = p1_dog['p1']
p2_dog['breed'] = p2_dog['p2']
p3_dog['breed'] = p3_dog['p3']
#merge them together
image_df_copy_2 = p1_dog.merge(p2_dog, how = 'outer').merge(p3_dog, how = 'outer')
#testing that it has the same number of observations as the original
assert image_df_copy_2.shape[0] == image_df_copy.shape[0]
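For reference, the same breed column could be built without splitting and re-merging by using numpy's select, which picks the first true condition per row (matching the p1 > p2 > p3 priority above); a sketch on a copy, so nothing used below changes (image_df_copy_alt and the 'unknown' default are illustrative only):
# sketch: build the breed column in place with np.select
conditions = [image_df_copy['p1_dog'], image_df_copy['p2_dog'], image_df_copy['p3_dog']]
choices = [image_df_copy['p1'], image_df_copy['p2'], image_df_copy['p3']]
image_df_copy_alt = image_df_copy.copy()
image_df_copy_alt['breed'] = np.select(conditions, choices, default = 'unknown')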
# study the breed column
np.sort(image_df_copy_2['breed'].unique())
# edit the breeds and capitalize them
image_df_copy_2['breed'] = image_df_copy_2['breed'].str.replace('_', ' ').str.capitalize()
#test the edit
np.sort(image_df_copy_2['breed'].unique())
#15 merging them all together
#first step : drop unneeded columns
local_df_copy = local_df_copy[['tweet_id', 'name', 'timestamp', 'doggo', 'floofer', 'pupper', 'puppo', 'overall_rating']]
image_df_copy = image_df_copy_2[['tweet_id', 'breed']] #no modification needed for the api data
local_df_copy.info()
image_df_copy.info()
json_df_copy.info()
#merge step
master_df = local_df_copy.merge(image_df_copy, how = 'inner', on = 'tweet_id').merge(json_df_copy, how = 'left', on = 'tweet_id')
#test it
master_df.info()
Dog names can still be improved, since the same names appear with different spellings (consistency); this was noted in the first iteration.
Now it's time to solve it.
#showing names values
np.sort(master_df['name'].unique())
Define: replace each variant or misspelled name with a canonical spelling (e.g. 'Billl' → 'Billy', 'Layla' → 'Laila'), and map leftover non-names such as 'My', 'Not', 'The', and 'Very' to 'No Name'. The full mapping is the dictionary passed to replace() in the code below.
master_df.replace({'Billl' : 'Billy',
'Blu' : 'Blue',
'Bo' : 'Bob',
'Bobb' : 'Bob',
'Bobbay' : 'Bob',
'Bobble' : 'Bob',
'Bradlay' : 'Bradley',
'Brady' : 'Bradley',
'Brandi' : 'Brandy',
'Butters' : 'Butter',
'Callie' : 'Cali',
'Cal' : 'Cali',
'Carll' : 'Carl',
'Churlie' : 'Charlie',
'Clarq' : 'Clarc',
'Craig' : 'Creg',
'Davey' : 'Dave',
'Emmie' : 'Emmy',
'Eriq' : 'Erik',
'Evy' :'Eve',
'Filup' : 'Fillup',
'Frankie' : 'Frank',
'Franq' : 'Frank',
'Frönq' : 'Frank',
'Harold' : 'Harnold',
'Incredibly' : 'No Name',
'Jackie' : 'Jack',
'Jazzy' : 'Jazz',
'Jeffri' : 'Jeffrey',
'Juckson' : 'Jackson',
'Jockson' : 'Jackson',
'Kaiya' : 'Kaia',
'Lassie' : 'Lacy',
'Laela' : 'Laila',
'Layla' : 'Laila',
'Leela' : 'Laila',
'Lilah' : 'Laila',
'Lili' : 'Laila',
'Lilli' : 'Laila',
'Lillie' : 'Laila',
'Lilly' : 'Laila',
'Lily' : 'Laila',
'Livvie' : 'Levi',
'Lulu' : 'Lolo',
'Mack' : 'Mac',
'Maks' : 'Max',
'Millie' : 'Miley',
'Marlee' : 'Marley',
'Mollie' : 'Molly',
'My' : 'No Name',
'Mya' : 'Maya',
'Not' : 'No Name',
'O' : 'No Name',
'Obi' : 'Obie',
'Officially' : 'No Name',
'Oliviér' : 'Oliver',
'Ole' : 'Ollie',
'One' : 'No Name',
'Ozzy' : 'Ozzie',
'Ralph' : 'Ralf',
'Quite' : 'No Name',
'Rolf' : 'Ralf',
'Samson' : 'Sampson',
'Sammy' : 'Sam',
'Scout' : 'Scott',
'Shadoe' : 'Shadow',
'Skye' : 'Sky',
'Snoopy' : 'Snoop',
'Sparky' : 'Spark',
'Stefan' : 'Stephan',
'Tayzie' : 'Taz',
'The' : 'No Name',
'Tobi' : 'Toby',
'Traviss' : 'Travis',
'Trevith' : 'Travis',
'Tripp' : 'Trip',
'Vinscent' : 'Vincent',
'Very' : 'No Name',
'Willie' : 'Willy',
'Willow' : 'Willy',
'Zeek' : 'Zeke',
'Zooey' : 'Zoey',
'Zoe' : 'Zoey'}, inplace = True)
#test the new names
np.sort(master_df['name'].unique())
#save the dataframe to twitter_archive_master.csv
master_df.to_csv('twitter_archive_master.csv', index = False)
#import pyplot
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
#barplot of the 10 most common names in the dataset
plt.figure()
master_df.query('name != "No Name"').name.value_counts().head(10).plot.bar(figsize = (10, 10))
plt.title('Most common dog names according to the WeRateDogs account', fontsize = 'xx-large')
plt.xlabel('name', fontsize = 'large')
plt.ylabel('occurrences in the dataset', fontsize = 'large')
sns.despine();
Most used dog names in the dataset:
#histogram of the overall_rating
plt.figure()
master_df.overall_rating.hist(bins = 12)
sns.despine()
plt.xlabel('rating (out of 10)', fontsize = 'large')
plt.ylabel('frequency', fontsize = 'large')
plt.title('histogram of dog ratings in WeRateDogs', fontsize = 'xx-large');
Insights: the distribution is strongly left-skewed with a peak at 12/10.
This was expected, since users tend to rate dogs above 10 (because it's fun :D).
Let's limit the plot to ratings of 9 and above.
#histogram of the overall_rating > 8
plt.figure()
master_df.query('overall_rating > 8').overall_rating.hist(bins = 6)
sns.despine()
plt.xlabel('rating (out of 10)', fontsize = 'large')
plt.ylabel('frequency', fontsize = 'large')
plt.title('histogram of dog ratings in WeRateDogs', fontsize = 'xx-large');
so the most common rating is 12
let's calculate the mean rating
#mean rating
master_df.overall_rating.describe()
the average rating is 10.85 and the median is 11
Now, I'll look at the relation between rating and other variables
#most common breeds
plt.figure()
master_df.breed.value_counts().head(10).plot.bar(figsize = (10, 10))
plt.title('most common dog breeds according to weRateDogs account', fontsize = 'xx-large')
plt.xlabel('breed', fontsize = 'large')
plt.ylabel('occurrences in the dataset', fontsize = 'large')
sns.despine();
Most common dog breeds in the dataset:
Exciting! Now it's time to see the highest- and lowest-rated dog breeds.
#using groupby to see the highest- and lowest-rated breeds on average
master_df.groupby('breed')['overall_rating'].mean().sort_values()
#count of Bouvier des flandres dogs
master_df.query('breed == "Bouvier des flandres"').shape[0]
#counts of all breeds
master_df.groupby('breed', observed = True).count()['tweet_id'].sort_values()
#calculate the lowest and highest average rating for breeds that occur more often than the mean occurrence frequency
#calculate the mean occurrence count
master_df.groupby('breed', observed = True).count()['tweet_id'].mean()
#require at least 15 occurrences and repeat the first step (for higher confidence)
master_df.groupby('breed').agg({'tweet_id' : 'size', 'overall_rating' : 'mean'}).query('tweet_id >= 15')['overall_rating'].sort_values()
The highest-rated breeds are the Samoyed, Golden retriever, and Great pyrenees.
The lowest-rated breed is the Soft-coated wheaten terrier.
#doggo average rating
master_df.query('doggo')['overall_rating'].mean()
#floofer average rating
master_df.query('floofer')['overall_rating'].mean()
#pupper average rating
master_df.query('pupper')['overall_rating'].mean()
#puppo average rating
master_df.query('puppo')['overall_rating'].mean()
No insights here; the values are very close.
I kept those 4 variables to try predicting the rating with them in a linear model, but from these results I suspect such a model won't perform well.
I'll keep them anyway for now.
#scatterplot function
def scatter(df, x, y, xlabel, ylabel, title):
    plt.figure(figsize = (8, 8))
    sns.swarmplot(x = x, y = y, data = df)
    #df.plot(x, y, kind = 'scatter', figsize = (8, 8), alpha = 0.5)
    plt.xlabel(xlabel, fontsize = 'large')
    plt.ylabel(ylabel, fontsize = 'large')
    plt.title(title, fontsize = 'xx-large')
#relation between retweets and rating
scatter(master_df, 'overall_rating', 'rt_count', 'rating(out of 10)', 'retweets', 'retweets vs rating')
#relation between favorites and rating
scatter(master_df, 'overall_rating', 'fav_count', 'rating(out of 10)', 'favorites', 'favorites vs rating')
And yes, it looks like the higher the rating, the more retweets and favorites a tweet gets.
Future plan: build a linear model to predict dog ratings from retweets, favorites, and/or the stage keywords.
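As a first step toward that plan, here is a minimal sketch of such a model using scikit-learn (assuming it is installed; rows with missing retweet or favorite counts are dropped first, and model_df is a throwaway name):
# sketch: quick linear model of rating vs. engagement and stage keywords
from sklearn.linear_model import LinearRegression

model_df = master_df.dropna(subset = ['rt_count', 'fav_count'])
X = model_df[['rt_count', 'fav_count', 'doggo', 'floofer', 'pupper', 'puppo']].astype(float)
y = model_df['overall_rating']
model = LinearRegression().fit(X, y)
print('R^2 on the training data:', model.score(X, y))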