import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
internet_usage = pd.read_csv('internet_session.csv')
internet_usage.head()
name | start_time | usage_time | IP | MAC | upload | download | total_transfer | seession_break_reason | |
---|---|---|---|---|---|---|---|---|---|
0 | user1 | 2022-05-10 02:59:32 | 00:00:36:28 | 10.55.14.222 | 48:E7:DA:58:22:E9 | 15861.76 | 333168.64 | 349030.40 | Idle-Timeout |
1 | user1 | 2022-05-10 18:53:27 | 00:01:49:56 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 16957.44 | 212152.32 | 229109.76 | Idle-Timeout |
2 | user1 | 2022-05-10 21:20:44 | 00:01:35:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 14080.0 | 195153.92 | 209233.92 | Idle-Timeout |
3 | user1 | 2022-05-11 00:37:42 | 00:00:26:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 5242.88 | 40806.4 | 46049.28 | Idle-Timeout |
4 | user1 | 2022-05-11 02:59:38 | 00:00:11:52 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 22067.2 | 10772.48 | 32839.68 | Idle-Timeout |
internet_usage.shape
(4712, 9)
internet_usage.columns
Index(['name', 'start_time', 'usage_time', 'IP', 'MAC', 'upload', 'download', 'total_transfer', 'seession_break_reason'], dtype='object')
internet_usage.columns = internet_usage.columns.str.lower()
internet_usage.columns
Index(['name', 'start_time', 'usage_time', 'ip', 'mac', 'upload', 'download', 'total_transfer', 'seession_break_reason'], dtype='object')
internet_usage.dtypes
name object start_time object usage_time object ip object mac object upload object download object total_transfer float64 seession_break_reason object dtype: object
internet_usage.isna().sum()
name 0 start_time 0 usage_time 0 ip 0 mac 0 upload 0 download 0 total_transfer 0 seession_break_reason 9 dtype: int64
internet_usage = internet_usage.dropna().copy()
internet_usage.isna().sum()
name 0 start_time 0 usage_time 0 ip 0 mac 0 upload 0 download 0 total_transfer 0 seession_break_reason 0 dtype: int64
internet_usage.duplicated().sum()
0
internet_usage['usage_time'] = internet_usage['usage_time'].str.replace('00:', '', 1)
internet_usage['usage_time'] = pd.to_datetime(internet_usage['usage_time'])
internet_usage['upload'] = internet_usage['upload'].str.extract('(\d+)', expand=False)
internet_usage.upload = internet_usage.upload.astype(float)
internet_usage['download'] = internet_usage['download'].str.extract('(\d+)', expand=False)
internet_usage.download = internet_usage.download.astype(float)
device = []
basename = 'device'
mac = internet_usage['mac'][0]
device_number = 1
for i in internet_usage['mac']:
if i == mac:
device.append(basename + str(device_number))
else:
device_number += 1
device.append(basename + str(device_number))
mac = i
internet_usage['device'] = device
internet_usage.dtypes
name object start_time object usage_time datetime64[ns] ip object mac object upload float64 download float64 total_transfer float64 seession_break_reason object device object dtype: object
Exploratory Data Analysis
internet_usage.describe(include='all', datetime_is_numeric=True)
name | start_time | usage_time | ip | mac | upload | download | total_transfer | seession_break_reason | device | |
---|---|---|---|---|---|---|---|---|---|---|
count | 4703 | 4703 | 4703 | 4703 | 4703 | 4.703000e+03 | 4.703000e+03 | 4.703000e+03 | 4703 | 4703 |
unique | 9 | 4029 | NaN | 1299 | 33 | NaN | NaN | NaN | 5 | 1224 |
top | user4 | 2022-05-10 02:59:32 | NaN | 10.55.0.89 | 48:E7:DA:58:22:E9 | NaN | NaN | NaN | Idle-Timeout | device1206 |
freq | 725 | 2 | NaN | 80 | 1235 | NaN | NaN | NaN | 4350 | 194 |
mean | NaN | NaN | 2022-12-25 02:10:05.038486272 | NaN | NaN | 3.378702e+04 | 3.966645e+05 | 4.304372e+05 | NaN | NaN |
min | NaN | NaN | 2022-12-25 00:00:01 | NaN | NaN | 2.000000e+00 | 9.000000e+00 | 1.120000e+00 | NaN | NaN |
25% | NaN | NaN | 2022-12-25 00:31:42 | NaN | NaN | 6.082000e+03 | 5.199800e+04 | 6.187008e+04 | NaN | NaN |
50% | NaN | NaN | 2022-12-25 01:19:40 | NaN | NaN | 1.531900e+04 | 1.782680e+05 | 2.027930e+05 | NaN | NaN |
75% | NaN | NaN | 2022-12-25 02:49:02 | NaN | NaN | 3.399600e+04 | 4.593660e+05 | 4.993997e+05 | NaN | NaN |
max | NaN | NaN | 2022-12-25 22:00:07 | NaN | NaN | 2.841640e+06 | 2.790261e+07 | 2.855272e+07 | NaN | NaN |
std | NaN | NaN | NaN | NaN | NaN | 9.493243e+04 | 9.657778e+05 | 9.960848e+05 | NaN | NaN |
internet_usage.name.value_counts()
user4 725 user6 674 user1 673 user9 571 user7 526 user3 518 user2 456 user5 335 user8 225 Name: name, dtype: int64
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='name', data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("User Count")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The earliest start time is:')
print(internet_usage.start_time.min())
print('The latest start time is:')
print(internet_usage.start_time.max())
The earliest start time is: 2022-05-09 22:52:41 The latest start time is: 2022-11-05 18:41:14
print('The minimum usage time is:')
print(internet_usage.usage_time.min())
print('The maximum usage time is:')
print(internet_usage.usage_time.max())
print('The average usage time is:')
print(internet_usage.usage_time.mean())
The minimum usage time is: 2022-12-25 00:00:01 The maximum usage time is: 2022-12-25 22:00:07 The average usage time is: 2022-12-25 02:10:05.038486272
print('The minimum usage time per user:')
usage_time_minimum = internet_usage.groupby('name').usage_time.min()
usage_time_minimum
The minimum usage time per user:
name user1 2022-12-25 00:00:18 user2 2022-12-25 00:00:08 user3 2022-12-25 00:00:01 user4 2022-12-25 00:00:45 user5 2022-12-25 00:01:07 user6 2022-12-25 00:00:18 user7 2022-12-25 00:00:20 user8 2022-12-25 00:00:20 user9 2022-12-25 00:00:09 Name: usage_time, dtype: datetime64[ns]
plt.figure(figsize=(18, 9))
usage_time_minimum.plot(kind='bar', logy=True)
plt.title("The minimum usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The maximum usage time per user:')
usage_time_maximum = internet_usage.groupby('name').usage_time.max()
usage_time_maximum
The maximum usage time per user:
name user1 2022-12-25 19:35:11 user2 2022-12-25 20:39:52 user3 2022-12-25 17:01:28 user4 2022-12-25 18:11:43 user5 2022-12-25 06:36:11 user6 2022-12-25 19:35:11 user7 2022-12-25 22:00:07 user8 2022-12-25 17:24:26 user9 2022-12-25 19:26:09 Name: usage_time, dtype: datetime64[ns]
plt.figure(figsize=(18, 9))
usage_time_maximum.plot(kind='bar', logy=True)
plt.title("The maximum usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
user 7 has the greatest maximum time usage with 22 hours and 7 seconds, while user 5 has the least amount with only 6 hours 36 minutes and 11 seconds
print('The average usage time per user:')
usage_time_average = internet_usage.groupby('name').usage_time.mean()
usage_time_average
The average usage time per user:
name user1 2022-12-25 01:42:47.665676032 user2 2022-12-25 01:42:53.866227968 user3 2022-12-25 02:19:42.019305216 user4 2022-12-25 02:38:01.766896640 user5 2022-12-25 01:20:11.701492736 user6 2022-12-25 01:42:49.998516224 user7 2022-12-25 02:17:45.053231872 user8 2022-12-25 04:03:14.555555584 user9 2022-12-25 02:29:32.180385280 Name: usage_time, dtype: datetime64[ns]
plt.figure(figsize=(18, 9))
usage_time_average.plot(kind='bar', logy=True)
plt.title("The average usage time per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
We can see that on average, User 8 has the most usage with 4 hours 3 minutes and 14 seconds, while User 5 has the least amount of average time usage with one hour 20 minutes and 11 seconds
internet_usage.ip.value_counts()
10.55.0.89 80 10.55.14.148 64 10.55.15.221 55 10.55.1.50 48 10.55.10.46 44 .. 10.55.15.44 1 10.55.15.237 1 10.55.14.166 1 10.55.3.200 1 10.55.4.159 1 Name: ip, Length: 1299, dtype: int64
internet_usage.device.value_counts()
device1206 194 device835 137 device11 137 device1212 132 device312 113 ... device582 1 device583 1 device584 1 device585 1 device613 1 Name: device, Length: 1224, dtype: int64
print('The minimum upload is: ' + str(internet_usage.upload.min()) + 'Kb')
print('The maximum upload is: ' + str(internet_usage.upload.max()) + 'Kb')
print('The average upload is: ' + str(round(internet_usage.upload.mean(), 2)) + 'Kb')
The minimum upload is: 2.0Kb The maximum upload is: 2841640.0Kb The average upload is: 33787.02Kb
We will start with the minimum upload per user
print('The minimum upload per user:')
internet_usage.groupby('name').upload.min()
The minimum upload per user:
name user1 19.0 user2 23.0 user3 36.0 user4 56.0 user5 382.0 user6 19.0 user7 2.0 user8 4.0 user9 41.0 Name: upload, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='upload' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum upload per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The maximum upload per user:')
internet_usage.groupby('name').upload.max()
The maximum upload per user:
name user1 638566.0 user2 379955.0 user3 1625292.0 user4 754462.0 user5 2841640.0 user6 638566.0 user7 653731.0 user8 709058.0 user9 1352663.0 Name: upload, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='upload' , data=internet_usage, ci=None, estimator=np.max)
ax.bar_label(ax.containers[0])
plt.title("Maximum upload per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The average upload per user:')
round(internet_usage.groupby('name').upload.mean(), 2)
The average upload per user:
name user1 27291.34 user2 29188.79 user3 29594.88 user4 33783.74 user5 64500.35 user6 27700.73 user7 23075.54 user8 59190.12 user9 37944.66 Name: upload, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='upload' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average upload per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The minimum download is: ' + str(internet_usage.download.min()) + 'Kb')
print('The maximum download is: ' + str(internet_usage.download.max()) + 'Kb')
print('The average download is: ' + str(round(internet_usage.download.mean(), 2)) + 'Kb')
The minimum download is: 9.0Kb The maximum download is: 27902607.0Kb The average download is: 396664.52Kb
print('The minimum download per user:')
internet_usage.groupby('name').download.min()
The minimum download per user:
name user1 50.0 user2 49.0 user3 60.0 user4 12.0 user5 461.0 user6 50.0 user7 9.0 user8 13.0 user9 61.0 Name: download, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The maximum download per user:')
internet_usage.groupby('name').download.max()
The maximum download per user:
name user1 23760732.0 user2 21831352.0 user3 3145728.0 user4 8325693.0 user5 5033164.0 user6 23760732.0 user7 27902607.0 user8 2747269.0 user9 6008340.0 Name: download, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.max)
ax.bar_label(ax.containers[0])
plt.title("Maximum download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The average download per user:')
round(internet_usage.groupby('name').download.mean(), 2)
The average download per user:
name user1 270725.96 user2 573798.02 user3 342230.37 user4 408580.26 user5 357278.08 user6 270545.18 user7 453828.61 user8 341417.12 user9 578981.51 Name: download, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='download' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average download per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The minimum total transfer is: ' + str(internet_usage.total_transfer.min()) + 'Kb')
print('The maximum total transfer is: ' + str(internet_usage.total_transfer.max()) + 'Kb')
print('The average total transfer is: ' + str(round(internet_usage.total_transfer.mean(), 2)) + 'Kb')
The minimum total transfer is: 1.12Kb The maximum total transfer is: 28552724.48Kb The average total transfer is: 430437.21Kb
print('The minimum total transfer per user:')
internet_usage.groupby('name').total_transfer.min()
The minimum total transfer per user:
name user1 75.34 user2 73.13 user3 1.12 user4 13.45 user5 924.40 user6 75.34 user7 15.25 user8 18.08 user9 102.64 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.min)
ax.bar_label(ax.containers[0])
plt.title("Minimum total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The maximum total transfer per user:')
internet_usage.groupby('name').total_transfer.max()
The maximum total transfer per user:
name user1 24389877.76 user2 22051553.28 user3 3282042.88 user4 8524922.88 user5 5158993.92 user6 24389877.76 user7 28552724.48 user8 3166699.52 user9 6155141.12 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.max)
ax.bar_label(ax.containers[0])
plt.title("Maximum total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
print('The average total transfer per user:')
round(internet_usage.groupby('name').total_transfer.mean(), 2)
The average total transfer per user:
name user1 297971.21 user2 602904.19 user3 371826.53 user4 442413.51 user5 421772.04 user6 298199.88 user7 476923.04 user8 400682.28 user9 616875.57 Name: total_transfer, dtype: float64
plt.figure(figsize=(18, 9))
ax = sns.barplot(x='name', y='total_transfer' , data=internet_usage, ci=None, estimator=np.mean)
ax.bar_label(ax.containers[0])
plt.title("Average total transfer per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
internet_usage.seession_break_reason.value_counts()
Idle-Timeout 4350 Lost-Carrier 162 Lost-Service 124 User-Request 65 NAS-Reboot 2 Name: seession_break_reason, dtype: int64
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='seession_break_reason' , data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("Session break reason Count")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='name', hue = 'seession_break_reason' , data=internet_usage)
ax.bar_label(ax.containers[0])
plt.title("Session break reason Count per user")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
If we check the session break reasons count per user, it's the same thing, the majority are from "Idle-Timeout" while the other reasons have very low occurrences