In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

internet_usage = pd.read_csv('internet_session.csv', parse_dates=['start_time'])
In [2]:
internet_usage.head()
Out[2]:
name start_time usage_time IP MAC upload download total_transfer seession_break_reason
0 user1 2022-05-10 02:59:32 00:00:36:28 10.55.14.222 48:E7:DA:58:22:E9 15861.76 333168.64 349030.40 Idle-Timeout
1 user1 2022-05-10 18:53:27 00:01:49:56 10.55.2.253 48:E7:DA:58:22:E9 16957.44 212152.32 229109.76 Idle-Timeout
2 user1 2022-05-10 21:20:44 00:01:35:00 10.55.2.253 48:E7:DA:58:22:E9 14080.0 195153.92 209233.92 Idle-Timeout
3 user1 2022-05-11 00:37:42 00:00:26:00 10.55.2.253 48:E7:DA:58:22:E9 5242.88 40806.4 46049.28 Idle-Timeout
4 user1 2022-05-11 02:59:38 00:00:11:52 10.55.2.253 48:E7:DA:58:22:E9 22067.2 10772.48 32839.68 Idle-Timeout
In [3]:
internet_usage.dtypes
Out[3]:
name                             object
start_time               datetime64[ns]
usage_time                       object
IP                               object
MAC                              object
upload                           object
download                         object
total_transfer                  float64
seession_break_reason            object
dtype: object
In [4]:
internet_usage.shape
Out[4]:
(4712, 9)
In [5]:
internet_usage.columns
Out[5]:
Index(['name', 'start_time', 'usage_time', 'IP', 'MAC', 'upload', 'download',
       'total_transfer', 'seession_break_reason'],
      dtype='object')
In [6]:
internet_usage.columns = internet_usage.columns.str.lower()
internet_usage.columns
Out[6]:
Index(['name', 'start_time', 'usage_time', 'ip', 'mac', 'upload', 'download',
       'total_transfer', 'seession_break_reason'],
      dtype='object')
In [7]:
internet_usage.isna().sum()
Out[7]:
name                     0
start_time               0
usage_time               0
ip                       0
mac                      0
upload                   0
download                 0
total_transfer           0
seession_break_reason    9
dtype: int64
In [9]:
internet_usage = internet_usage.dropna().copy()
internet_usage.isna().sum()
Out[9]:
name                     0
start_time               0
usage_time               0
ip                       0
mac                      0
upload                   0
download                 0
total_transfer           0
seession_break_reason    0
dtype: int64
In [10]:
internet_usage.duplicated().sum()
Out[10]:
0
In [25]:
internet_usage.dtypes
Out[25]:
name                             object
start_time               datetime64[ns]
usage_time               datetime64[ns]
ip                               object
mac                              object
upload                          float64
download                        float64
total_transfer                  float64
seession_break_reason            object
dtype: object
In [27]:
internet_usage.describe(include ='all', datetime_is_numeric = True)
Out[27]:
name start_time usage_time ip mac upload download total_transfer seession_break_reason
count 4703 4703 4703 4703 4703 4.703000e+03 4.703000e+03 4.703000e+03 4703
unique 9 NaN NaN 1299 33 NaN NaN NaN 5
top user4 NaN NaN 10.55.0.89 48:E7:DA:58:22:E9 NaN NaN NaN Idle-Timeout
freq 725 NaN NaN 80 1235 NaN NaN NaN 4350
mean NaN 2022-08-08 09:35:44.875185920 2023-01-08 02:10:05.038486272 NaN NaN 3.378702e+04 3.966645e+05 4.304372e+05 NaN
min NaN 2022-05-09 22:52:41 2023-01-08 00:00:01 NaN NaN 2.000000e+00 9.000000e+00 1.120000e+00 NaN
25% NaN 2022-06-14 18:33:06.500000 2023-01-08 00:31:42 NaN NaN 6.082000e+03 5.199800e+04 6.187008e+04 NaN
50% NaN 2022-08-19 13:56:28 2023-01-08 01:19:40 NaN NaN 1.531900e+04 1.782680e+05 2.027930e+05 NaN
75% NaN 2022-09-24 22:30:58.500000 2023-01-08 02:49:02 NaN NaN 3.399600e+04 4.593660e+05 4.993997e+05 NaN
max NaN 2022-11-05 18:41:14 2023-01-08 22:00:07 NaN NaN 2.841640e+06 2.790261e+07 2.855272e+07 NaN
std NaN NaN NaN NaN NaN 9.493243e+04 9.657778e+05 9.960848e+05 NaN
In [28]:
internet_usage.name.value_counts()
Out[28]:
user4    725
user6    674
user1    673
user9    571
user7    526
user3    518
user2    456
user5    335
user8    225
Name: name, dtype: int64
In [29]:
plt.figure(figsize = (18,9))
ax = sns.countplot(x='name', data = internet_usage)
ax.bar_label(ax.containers[0])
plt.title("User Count")
plt.show()
plt.clf
Out[29]:
<function matplotlib.pyplot.clf()>
In [30]:
print('The earliest start time is:')
print(internet_usage.start_time.min())
print('The latest start time is:')
print(internet_usage.start_time.max())
The earliest start time is:
2022-05-09 22:52:41
The latest start time is:
2022-11-05 18:41:14
In [31]:
print('The earliest usage time is:')
print(internet_usage.usage_time.min())
print('The latest usage time is:')
print(internet_usage.usage_time.max())
The earliest usage time is:
2023-01-08 00:00:01
The latest usage time is:
2023-01-08 22:00:07
In [35]:
print('The minimum usage time per user is:')
usage_time_minimum = internet_usage.groupby('name').usage_time.min()
usage_time_minimum
The minimum usage time per user is:
Out[35]:
name
user1   2023-01-08 00:00:18
user2   2023-01-08 00:00:08
user3   2023-01-08 00:00:01
user4   2023-01-08 00:00:45
user5   2023-01-08 00:01:07
user6   2023-01-08 00:00:18
user7   2023-01-08 00:00:20
user8   2023-01-08 00:00:20
user9   2023-01-08 00:00:09
Name: usage_time, dtype: datetime64[ns]
In [36]:
plt.figure(figsize=(18, 9))
usage_time_minimum.plot(kind='bar', logy=True)
plt.title("The minimum usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [37]:
print('The maximum usage time per user:')
usage_time_maximum = internet_usage.groupby('name').usage_time.max()
usage_time_maximum
The maximum usage time per user:
Out[37]:
name
user1   2023-01-08 19:35:11
user2   2023-01-08 20:39:52
user3   2023-01-08 17:01:28
user4   2023-01-08 18:11:43
user5   2023-01-08 06:36:11
user6   2023-01-08 19:35:11
user7   2023-01-08 22:00:07
user8   2023-01-08 17:24:26
user9   2023-01-08 19:26:09
Name: usage_time, dtype: datetime64[ns]
In [38]:
plt.figure(figsize=(18, 9))
usage_time_maximum.plot(kind='bar', logy=True)
plt.title("The maximum usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [39]:
print("the average usage time per user is:")
usage_time_average = internet_usage.groupby('name').usage_time.mean()
usage_time_average
the average usage time per user is:
Out[39]:
name
user1   2023-01-08 01:42:47.665676032
user2   2023-01-08 01:42:53.866227968
user3   2023-01-08 02:19:42.019305216
user4   2023-01-08 02:38:01.766896640
user5   2023-01-08 01:20:11.701492736
user6   2023-01-08 01:42:49.998516224
user7   2023-01-08 02:17:45.053231872
user8   2023-01-08 04:03:14.555555584
user9   2023-01-08 02:29:32.180385280
Name: usage_time, dtype: datetime64[ns]
In [40]:
plt.figure(figsize=(18, 9))
usage_time_average.plot(kind='bar', logy=True)
plt.title("The average usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [41]:
internet_usage.ip.value_counts()
Out[41]:
10.55.0.89      80
10.55.14.148    64
10.55.15.221    55
10.55.1.50      48
10.55.10.46     44
                ..
10.55.15.44      1
10.55.15.237     1
10.55.14.166     1
10.55.3.200      1
10.55.4.159      1
Name: ip, Length: 1299, dtype: int64
In [ ]:
#most used ip address is 10.55.0.89
In [45]:
device = []
basename = 'device'
mac = internet_usage['mac'][0]
device_number = 1
for i in internet_usage['mac']:
    if i == mac:
        device.append(basename + str(device_number))
    else:
        device_number += 1
        device.append(basename + str(device_number))
        mac = i
internet_usage['device'] = device
In [46]:
internet_usage.device.value_counts()
Out[46]:
device1206    194
device835     137
device11      137
device1212    132
device312     113
             ... 
device582       1
device583       1
device584       1
device585       1
device613       1
Name: device, Length: 1224, dtype: int64
In [ ]:
#most used device is 1206 with 194 times use
In [47]:
print('The minimum upload is :' ,str(internet_usage.upload.min()), 'kb')
print('The maximum upload is :', str(internet_usage.upload.max()), 'kb')
print('The average upload is:', str(round(internet_usage.upload.mean(),2)),'kb')
The minimum upload is : 2.0 kb
The maximum upload is : 2841640.0 kb
The average upload is: 33787.02 kb
In [48]:
print("the minimum upload per user:")
internet_usage.groupby('name').upload.min()
the minimum upload per user:
Out[48]:
name
user1     19.0
user2     23.0
user3     36.0
user4     56.0
user5    382.0
user6     19.0
user7      2.0
user8      4.0
user9     41.0
Name: upload, dtype: float64
In [50]:
plt.figure(figsize = (18,9))
ax = sns.barplot(x='name',y = 'upload', data = internet_usage, ci = None , estimator= np.min)
ax.bar_label(ax.containers[0])
plt.title('Minimum upload per user')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [51]:
print('The maximum upload per user is:')
internet_usage.groupby('name').upload.max()
The maximum upload per user is:
Out[51]:
name
user1     638566.0
user2     379955.0
user3    1625292.0
user4     754462.0
user5    2841640.0
user6     638566.0
user7     653731.0
user8     709058.0
user9    1352663.0
Name: upload, dtype: float64
In [55]:
plt.figure(figsize = (18,9))
ax = sns.barplot(x ='name',y = 'upload', data = internet_usage, ci = None, estimator = np.max)
ax.bar_label(ax.containers[0])
plt.title('Maximum upload per user:')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [56]:
print("the average upload per user is")
round(internet_usage.groupby('name').upload.mean(),2)
the average upload per user is
Out[56]:
name
user1    27291.34
user2    29188.79
user3    29594.88
user4    33783.74
user5    64500.35
user6    27700.73
user7    23075.54
user8    59190.12
user9    37944.66
Name: upload, dtype: float64
In [57]:
plt.figure(figsize = (18,9))
ax = sns.barplot(x = 'name', y = 'upload', data = internet_usage , ci = None, estimator = np.mean)
ax.bar_label(ax.containers[0])
plt.title('Average upload per user')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [58]:
print('the minimum download per user is:')
internet_usage.groupby('name').download.min()
the minimum download per user is:
Out[58]:
name
user1     50.0
user2     49.0
user3     60.0
user4     12.0
user5    461.0
user6     50.0
user7      9.0
user8     13.0
user9     61.0
Name: download, dtype: float64
In [59]:
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [60]:
print('the maximum download per user is:')
internet_usage.groupby('name').download.max()
the maximum download per user is:
Out[60]:
name
user1    23760732.0
user2    21831352.0
user3     3145728.0
user4     8325693.0
user5     5033164.0
user6    23760732.0
user7    27902607.0
user8     2747269.0
user9     6008340.0
Name: download, dtype: float64
In [61]:
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.max)
ax.bar_label(ax.containers[0])
plt.title("Maximum download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [62]:
print('the average download per user is:')
internet_usage.groupby('name').download.mean()
the average download per user is:
Out[62]:
name
user1    270725.964339
user2    573798.015351
user3    342230.372587
user4    408580.259310
user5    357278.077612
user6    270545.182493
user7    453828.606464
user8    341417.124444
user9    578981.506130
Name: download, dtype: float64
In [63]:
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [64]:
print("the minimum total transfer is:" , str(internet_usage.total_transfer.min()), 'kb')
print('the maximum total transfer is:', str(internet_usage.total_transfer.max()),'kb')
print('the average total transfer is: ', str(internet_usage.total_transfer.mean()),'kb')
the minimum total transfer is: 1.12 kb
the maximum total transfer is: 28552724.48 kb
the average total transfer is:  430437.2108228794 kb
In [65]:
print('the minimum total transfer per user is')
internet_usage.groupby('name').total_transfer.min()
the minimum total transfer per user is
Out[65]:
name
user1     75.34
user2     73.13
user3      1.12
user4     13.45
user5    924.40
user6     75.34
user7     15.25
user8     18.08
user9    102.64
Name: total_transfer, dtype: float64
In [66]:
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [67]:
print('the maximum total transfer per user is')
internet_usage.groupby('name').total_transfer.max()
the maximum total transfer per user is
Out[67]:
name
user1    24389877.76
user2    22051553.28
user3     3282042.88
user4     8524922.88
user5     5158993.92
user6    24389877.76
user7    28552724.48
user8     3166699.52
user9     6155141.12
Name: total_transfer, dtype: float64
In [69]:
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.max
ax.bar_label(ax.containers[0])
plt.title("Maximum total transfer per user:")
plt.show()
plt.clf()
  Input In [69]
    ax.bar_label(ax.containers[0])
    ^
SyntaxError: invalid syntax
In [70]:
print('The average total transfer per user:')
round(internet_usage.groupby('name').total_transfer.mean(), 2)
The average total transfer per user:
Out[70]:
name
user1    297971.21
user2    602904.19
user3    371826.53
user4    442413.51
user5    421772.04
user6    298199.88
user7    476923.04
user8    400682.28
user9    616875.57
Name: total_transfer, dtype: float64
In [71]:
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [72]:
internet_usage.seession_break_reason.value_counts()
Out[72]:
Idle-Timeout    4350
Lost-Carrier     162
Lost-Service     124
User-Request      65
NAS-Reboot         2
Name: seession_break_reason, dtype: int64
In [ ]:
#max reason is idle-timeout
In [73]:
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='seession_break_reason' , data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("Session break reason Count")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [74]:
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='name', hue = 'seession_break_reason' , data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("Session break reason Count per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [ ]:
#What is the most frequent internet activity time of the day ?
In [75]:
internet_usage['hour'] = pd.to_datetime(internet_usage['start_time']).dt.hour
frequent_activity_time_of_day = internet_usage['hour'].value_counts().sort_index()

plt.figure(figsize=(18, 9))
sns.lineplot(data=frequent_activity_time_of_day)
plt.xticks(np.linspace(start=0, stop=24, num=25))
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [ ]:
#Most frequent activity time is 22H
In [ ]:
#How often the ip changes ?
In [76]:
base_ip = '48:E7:DA:58:22:E9'
ip_count = 0
for i in range(1,internet_usage.shape[0]):
    if internet_usage.iloc[i]['ip']!=base_ip:
        ip_count+=1
        base_ip = internet_usage.iloc[i]['ip']

print("The IP address changed" , str(ip_count), 'times')
The IP address changed 2303 times
In [ ]:
#How often the device changed.
In [78]:
base_device = 'device1'
device_count = 0
for i in range(1,internet_usage.shape[0]):
    if internet_usage.iloc[i]['device']!=base_device:
        device_count +=1
        base_device = internet_usage.iloc[i]['device']
    
print("the device was changed", str(device_count), 'times')
the device was changed 1223 times
In [ ]:
#What is the average usage per hour , per day and per month ?
In [80]:
internet_usage.reset_index(inplace=True)

internet_usage['day'] = internet_usage['start_time'].dt.day
internet_usage['month'] = internet_usage['start_time'].dt.month

hourly_average = internet_usage.groupby('hour').total_transfer.mean()
print('The Average usage per hour is:\n ' +  str(round(hourly_average, 2)))
      
The Average usage per hour is:
 hour
0     464530.44
1     530880.86
2     431576.11
3     345303.34
4     359809.44
5     275960.91
6     468959.59
7     292886.83
8     366681.92
9     377480.64
10    393259.12
11    309492.45
12    310137.98
13    335270.58
14    472403.71
15    517005.11
16    403919.40
17    525423.69
18    666590.76
19    389841.79
20    355862.80
21    474038.34
22    449600.50
23    407785.08
Name: total_transfer, dtype: float64
In [81]:
plt.figure(figsize=(18, 9))
sns.barplot(x='hour', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
plt.title("Average usage per hour")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [82]:
#average usage per day

daily_average = internet_usage.groupby('day').total_transfer.mean()
print('the daily average usage per day is', str(round(daily_average,2)))
the daily average usage per day is day
1     396705.04
2     494496.48
3     445865.63
4     676332.03
5     652195.66
6     396261.75
7     402259.89
8     301859.57
9     393521.97
10    350665.02
11    729857.65
12    346695.95
13    501906.70
14    352701.10
15    521520.51
16    426719.39
17    475795.71
18    337490.93
19    301941.32
20    365130.12
21    462211.69
22    486595.37
23    383153.93
24    320598.94
25    443689.47
26    463432.02
27    324318.12
28    494576.34
29    363645.61
30    361418.88
31    369118.01
Name: total_transfer, dtype: float64
In [83]:
plt.figure(figsize=(18, 9))
sns.barplot(x='day', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
plt.title("Average usage per day")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [86]:
monthly_average = internet_usage.groupby('month').total_transfer.mean()
print('the Average usage per month is ', str(round(monthly_average,2)))
the Average usage per month is  month
5     311177.16
6     338418.08
7     418583.99
8     479042.44
9     482955.52
10    549467.63
11    399804.11
Name: total_transfer, dtype: float64
In [88]:
plt.figure(figsize = (18,9))
sns.barplot(x = 'month', y = 'total_transfer', data = internet_usage, ci = None, estimator = np.mean)
plt.title('The average usage per month')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
In [ ]:
#the highest average usage per month was during the month of October with 549467.63Kb total transfer of data