import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
internet_usage = pd.read_csv('internet_session.csv', parse_dates=['start_time'])
internet_usage.head()
name | start_time | usage_time | IP | MAC | upload | download | total_transfer | seession_break_reason | |
---|---|---|---|---|---|---|---|---|---|
0 | user1 | 2022-05-10 02:59:32 | 00:00:36:28 | 10.55.14.222 | 48:E7:DA:58:22:E9 | 15861.76 | 333168.64 | 349030.40 | Idle-Timeout |
1 | user1 | 2022-05-10 18:53:27 | 00:01:49:56 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 16957.44 | 212152.32 | 229109.76 | Idle-Timeout |
2 | user1 | 2022-05-10 21:20:44 | 00:01:35:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 14080.0 | 195153.92 | 209233.92 | Idle-Timeout |
3 | user1 | 2022-05-11 00:37:42 | 00:00:26:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 5242.88 | 40806.4 | 46049.28 | Idle-Timeout |
4 | user1 | 2022-05-11 02:59:38 | 00:00:11:52 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 22067.2 | 10772.48 | 32839.68 | Idle-Timeout |
internet_usage.dtypes
name object start_time datetime64[ns] usage_time object IP object MAC object upload object download object total_transfer float64 seession_break_reason object dtype: object
internet_usage.shape
(4712, 9)
internet_usage.columns
Index(['name', 'start_time', 'usage_time', 'IP', 'MAC', 'upload', 'download', 'total_transfer', 'seession_break_reason'], dtype='object')
internet_usage.columns = internet_usage.columns.str.lower()
internet_usage.columns
Index(['name', 'start_time', 'usage_time', 'ip', 'mac', 'upload', 'download', 'total_transfer', 'seession_break_reason'], dtype='object')
internet_usage.isna().sum()
name 0 start_time 0 usage_time 0 ip 0 mac 0 upload 0 download 0 total_transfer 0 seession_break_reason 9 dtype: int64
internet_usage = internet_usage.dropna().copy()
internet_usage.isna().sum()
name 0 start_time 0 usage_time 0 ip 0 mac 0 upload 0 download 0 total_transfer 0 seession_break_reason 0 dtype: int64
internet_usage.duplicated().sum()
0
internet_usage.dtypes
name object start_time datetime64[ns] usage_time datetime64[ns] ip object mac object upload float64 download float64 total_transfer float64 seession_break_reason object dtype: object
internet_usage.describe(include ='all', datetime_is_numeric = True)
name | start_time | usage_time | ip | mac | upload | download | total_transfer | seession_break_reason | |
---|---|---|---|---|---|---|---|---|---|
count | 4703 | 4703 | 4703 | 4703 | 4703 | 4.703000e+03 | 4.703000e+03 | 4.703000e+03 | 4703 |
unique | 9 | NaN | NaN | 1299 | 33 | NaN | NaN | NaN | 5 |
top | user4 | NaN | NaN | 10.55.0.89 | 48:E7:DA:58:22:E9 | NaN | NaN | NaN | Idle-Timeout |
freq | 725 | NaN | NaN | 80 | 1235 | NaN | NaN | NaN | 4350 |
mean | NaN | 2022-08-08 09:35:44.875185920 | 2023-01-08 02:10:05.038486272 | NaN | NaN | 3.378702e+04 | 3.966645e+05 | 4.304372e+05 | NaN |
min | NaN | 2022-05-09 22:52:41 | 2023-01-08 00:00:01 | NaN | NaN | 2.000000e+00 | 9.000000e+00 | 1.120000e+00 | NaN |
25% | NaN | 2022-06-14 18:33:06.500000 | 2023-01-08 00:31:42 | NaN | NaN | 6.082000e+03 | 5.199800e+04 | 6.187008e+04 | NaN |
50% | NaN | 2022-08-19 13:56:28 | 2023-01-08 01:19:40 | NaN | NaN | 1.531900e+04 | 1.782680e+05 | 2.027930e+05 | NaN |
75% | NaN | 2022-09-24 22:30:58.500000 | 2023-01-08 02:49:02 | NaN | NaN | 3.399600e+04 | 4.593660e+05 | 4.993997e+05 | NaN |
max | NaN | 2022-11-05 18:41:14 | 2023-01-08 22:00:07 | NaN | NaN | 2.841640e+06 | 2.790261e+07 | 2.855272e+07 | NaN |
std | NaN | NaN | NaN | NaN | NaN | 9.493243e+04 | 9.657778e+05 | 9.960848e+05 | NaN |
internet_usage.name.value_counts()
user4 725 user6 674 user1 673 user9 571 user7 526 user3 518 user2 456 user5 335 user8 225 Name: name, dtype: int64
plt.figure(figsize = (18,9))
ax = sns.countplot(x='name', data = internet_usage)
ax.bar_label(ax.containers[0])
plt.title("User Count")
plt.show()
plt.clf
<function matplotlib.pyplot.clf()>
print('The earliest start time is:')
print(internet_usage.start_time.min())
print('The latest start time is:')
print(internet_usage.start_time.max())
The earliest start time is: 2022-05-09 22:52:41 The latest start time is: 2022-11-05 18:41:14
print('The earliest usage time is:')
print(internet_usage.usage_time.min())
print('The latest usage time is:')
print(internet_usage.usage_time.max())
The earliest usage time is: 2023-01-08 00:00:01 The latest usage time is: 2023-01-08 22:00:07
print('The minimum usage time per user is:')
usage_time_minimum = internet_usage.groupby('name').usage_time.min()
usage_time_minimum
The minimum usage time per user is:
name user1 2023-01-08 00:00:18 user2 2023-01-08 00:00:08 user3 2023-01-08 00:00:01 user4 2023-01-08 00:00:45 user5 2023-01-08 00:01:07 user6 2023-01-08 00:00:18 user7 2023-01-08 00:00:20 user8 2023-01-08 00:00:20 user9 2023-01-08 00:00:09 Name: usage_time, dtype: datetime64[ns]
plt.figure(figsize=(18, 9))
usage_time_minimum.plot(kind='bar', logy=True)
plt.title("The minimum usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The maximum usage time per user:')
usage_time_maximum = internet_usage.groupby('name').usage_time.max()
usage_time_maximum
The maximum usage time per user:
name user1 2023-01-08 19:35:11 user2 2023-01-08 20:39:52 user3 2023-01-08 17:01:28 user4 2023-01-08 18:11:43 user5 2023-01-08 06:36:11 user6 2023-01-08 19:35:11 user7 2023-01-08 22:00:07 user8 2023-01-08 17:24:26 user9 2023-01-08 19:26:09 Name: usage_time, dtype: datetime64[ns]
plt.figure(figsize=(18, 9))
usage_time_maximum.plot(kind='bar', logy=True)
plt.title("The maximum usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print("the average usage time per user is:")
usage_time_average = internet_usage.groupby('name').usage_time.mean()
usage_time_average
the average usage time per user is:
name user1 2023-01-08 01:42:47.665676032 user2 2023-01-08 01:42:53.866227968 user3 2023-01-08 02:19:42.019305216 user4 2023-01-08 02:38:01.766896640 user5 2023-01-08 01:20:11.701492736 user6 2023-01-08 01:42:49.998516224 user7 2023-01-08 02:17:45.053231872 user8 2023-01-08 04:03:14.555555584 user9 2023-01-08 02:29:32.180385280 Name: usage_time, dtype: datetime64[ns]
plt.figure(figsize=(18, 9))
usage_time_average.plot(kind='bar', logy=True)
plt.title("The average usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
internet_usage.ip.value_counts()
10.55.0.89 80 10.55.14.148 64 10.55.15.221 55 10.55.1.50 48 10.55.10.46 44 .. 10.55.15.44 1 10.55.15.237 1 10.55.14.166 1 10.55.3.200 1 10.55.4.159 1 Name: ip, Length: 1299, dtype: int64
#most used ip address is 10.55.0.89
device = []
basename = 'device'
mac = internet_usage['mac'][0]
device_number = 1
for i in internet_usage['mac']:
if i == mac:
device.append(basename + str(device_number))
else:
device_number += 1
device.append(basename + str(device_number))
mac = i
internet_usage['device'] = device
internet_usage.device.value_counts()
device1206 194 device835 137 device11 137 device1212 132 device312 113 ... device582 1 device583 1 device584 1 device585 1 device613 1 Name: device, Length: 1224, dtype: int64
#most used device is 1206 with 194 times use
print('The minimum upload is :' ,str(internet_usage.upload.min()), 'kb')
print('The maximum upload is :', str(internet_usage.upload.max()), 'kb')
print('The average upload is:', str(round(internet_usage.upload.mean(),2)),'kb')
The minimum upload is : 2.0 kb The maximum upload is : 2841640.0 kb The average upload is: 33787.02 kb
print("the minimum upload per user:")
internet_usage.groupby('name').upload.min()
the minimum upload per user:
name user1 19.0 user2 23.0 user3 36.0 user4 56.0 user5 382.0 user6 19.0 user7 2.0 user8 4.0 user9 41.0 Name: upload, dtype: float64
plt.figure(figsize = (18,9))
ax = sns.barplot(x='name',y = 'upload', data = internet_usage, ci = None , estimator= np.min)
ax.bar_label(ax.containers[0])
plt.title('Minimum upload per user')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The maximum upload per user is:')
internet_usage.groupby('name').upload.max()
The maximum upload per user is:
name user1 638566.0 user2 379955.0 user3 1625292.0 user4 754462.0 user5 2841640.0 user6 638566.0 user7 653731.0 user8 709058.0 user9 1352663.0 Name: upload, dtype: float64
plt.figure(figsize = (18,9))
ax = sns.barplot(x ='name',y = 'upload', data = internet_usage, ci = None, estimator = np.max)
ax.bar_label(ax.containers[0])
plt.title('Maximum upload per user:')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print("the average upload per user is")
round(internet_usage.groupby('name').upload.mean(),2)
the average upload per user is
name user1 27291.34 user2 29188.79 user3 29594.88 user4 33783.74 user5 64500.35 user6 27700.73 user7 23075.54 user8 59190.12 user9 37944.66 Name: upload, dtype: float64
plt.figure(figsize = (18,9))
ax = sns.barplot(x = 'name', y = 'upload', data = internet_usage , ci = None, estimator = np.mean)
ax.bar_label(ax.containers[0])
plt.title('Average upload per user')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('the minimum download per user is:')
internet_usage.groupby('name').download.min()
the minimum download per user is:
name user1 50.0 user2 49.0 user3 60.0 user4 12.0 user5 461.0 user6 50.0 user7 9.0 user8 13.0 user9 61.0 Name: download, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('the maximum download per user is:')
internet_usage.groupby('name').download.max()
the maximum download per user is:
name user1 23760732.0 user2 21831352.0 user3 3145728.0 user4 8325693.0 user5 5033164.0 user6 23760732.0 user7 27902607.0 user8 2747269.0 user9 6008340.0 Name: download, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.max)
ax.bar_label(ax.containers[0])
plt.title("Maximum download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('the average download per user is:')
internet_usage.groupby('name').download.mean()
the average download per user is:
name user1 270725.964339 user2 573798.015351 user3 342230.372587 user4 408580.259310 user5 357278.077612 user6 270545.182493 user7 453828.606464 user8 341417.124444 user9 578981.506130 Name: download, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print("the minimum total transfer is:" , str(internet_usage.total_transfer.min()), 'kb')
print('the maximum total transfer is:', str(internet_usage.total_transfer.max()),'kb')
print('the average total transfer is: ', str(internet_usage.total_transfer.mean()),'kb')
the minimum total transfer is: 1.12 kb the maximum total transfer is: 28552724.48 kb the average total transfer is: 430437.2108228794 kb
print('the minimum total transfer per user is')
internet_usage.groupby('name').total_transfer.min()
the minimum total transfer per user is
name user1 75.34 user2 73.13 user3 1.12 user4 13.45 user5 924.40 user6 75.34 user7 15.25 user8 18.08 user9 102.64 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('the maximum total transfer per user is')
internet_usage.groupby('name').total_transfer.max()
the maximum total transfer per user is
name user1 24389877.76 user2 22051553.28 user3 3282042.88 user4 8524922.88 user5 5158993.92 user6 24389877.76 user7 28552724.48 user8 3166699.52 user9 6155141.12 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.max
ax.bar_label(ax.containers[0])
plt.title("Maximum total transfer per user:")
plt.show()
plt.clf()
Input In [69] ax.bar_label(ax.containers[0]) ^ SyntaxError: invalid syntax
print('The average total transfer per user:')
round(internet_usage.groupby('name').total_transfer.mean(), 2)
The average total transfer per user:
name user1 297971.21 user2 602904.19 user3 371826.53 user4 442413.51 user5 421772.04 user6 298199.88 user7 476923.04 user8 400682.28 user9 616875.57 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
internet_usage.seession_break_reason.value_counts()
Idle-Timeout 4350 Lost-Carrier 162 Lost-Service 124 User-Request 65 NAS-Reboot 2 Name: seession_break_reason, dtype: int64
#max reason is idle-timeout
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='seession_break_reason' , data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("Session break reason Count")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='name', hue = 'seession_break_reason' , data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("Session break reason Count per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
#What is the most frequent internet activity time of the day ?
internet_usage['hour'] = pd.to_datetime(internet_usage['start_time']).dt.hour
frequent_activity_time_of_day = internet_usage['hour'].value_counts().sort_index()
plt.figure(figsize=(18, 9))
sns.lineplot(data=frequent_activity_time_of_day)
plt.xticks(np.linspace(start=0, stop=24, num=25))
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
#Most frequent activity time is 22H
#How often the ip changes ?
base_ip = '48:E7:DA:58:22:E9'
ip_count = 0
for i in range(1,internet_usage.shape[0]):
if internet_usage.iloc[i]['ip']!=base_ip:
ip_count+=1
base_ip = internet_usage.iloc[i]['ip']
print("The IP address changed" , str(ip_count), 'times')
The IP address changed 2303 times
#How often the device changed.
base_device = 'device1'
device_count = 0
for i in range(1,internet_usage.shape[0]):
if internet_usage.iloc[i]['device']!=base_device:
device_count +=1
base_device = internet_usage.iloc[i]['device']
print("the device was changed", str(device_count), 'times')
the device was changed 1223 times
#What is the average usage per hour , per day and per month ?
internet_usage.reset_index(inplace=True)
internet_usage['day'] = internet_usage['start_time'].dt.day
internet_usage['month'] = internet_usage['start_time'].dt.month
hourly_average = internet_usage.groupby('hour').total_transfer.mean()
print('The Average usage per hour is:\n ' + str(round(hourly_average, 2)))
The Average usage per hour is: hour 0 464530.44 1 530880.86 2 431576.11 3 345303.34 4 359809.44 5 275960.91 6 468959.59 7 292886.83 8 366681.92 9 377480.64 10 393259.12 11 309492.45 12 310137.98 13 335270.58 14 472403.71 15 517005.11 16 403919.40 17 525423.69 18 666590.76 19 389841.79 20 355862.80 21 474038.34 22 449600.50 23 407785.08 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
sns.barplot(x='hour', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
plt.title("Average usage per hour")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
#average usage per day
daily_average = internet_usage.groupby('day').total_transfer.mean()
print('the daily average usage per day is', str(round(daily_average,2)))
the daily average usage per day is day 1 396705.04 2 494496.48 3 445865.63 4 676332.03 5 652195.66 6 396261.75 7 402259.89 8 301859.57 9 393521.97 10 350665.02 11 729857.65 12 346695.95 13 501906.70 14 352701.10 15 521520.51 16 426719.39 17 475795.71 18 337490.93 19 301941.32 20 365130.12 21 462211.69 22 486595.37 23 383153.93 24 320598.94 25 443689.47 26 463432.02 27 324318.12 28 494576.34 29 363645.61 30 361418.88 31 369118.01 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
sns.barplot(x='day', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
plt.title("Average usage per day")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
monthly_average = internet_usage.groupby('month').total_transfer.mean()
print('the Average usage per month is ', str(round(monthly_average,2)))
the Average usage per month is month 5 311177.16 6 338418.08 7 418583.99 8 479042.44 9 482955.52 10 549467.63 11 399804.11 Name: total_transfer, dtype: float64
plt.figure(figsize = (18,9))
sns.barplot(x = 'month', y = 'total_transfer', data = internet_usage, ci = None, estimator = np.mean)
plt.title('The average usage per month')
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
#the highest average usage per month was during the month of October with 549467.63Kb total transfer of data