import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the session log. 'start_time' is parsed into datetimes at read time so
# the .dt accessor can be used on that column later on.
# NOTE(review): the path is an absolute, machine-specific location.
csv_path = 'C:/Users/gsaladi/OneDrive - FactSet/Desktop/jntuh/ML DL Assignment 1/internet_session.csv'
internet_usage = pd.read_csv(csv_path, parse_dates=['start_time'])
# Bare expression: displays the frame when it is the last line of a notebook cell.
internet_usage
name | start_time | usage_time | IP | MAC | upload | download | total_transfer | seession_break_reason | |
---|---|---|---|---|---|---|---|---|---|
0 | user1 | 2022-05-10 02:59:32 | 00:00:36:28 | 10.55.14.222 | 48:E7:DA:58:22:E9 | 15861.76 | 333168.64 | 349030.40 | Idle-Timeout |
1 | user1 | 2022-05-10 18:53:27 | 00:01:49:56 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 16957.44 | 212152.32 | 229109.76 | Idle-Timeout |
2 | user1 | 2022-05-10 21:20:44 | 00:01:35:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 14080.0 | 195153.92 | 209233.92 | Idle-Timeout |
3 | user1 | 2022-05-11 00:37:42 | 00:00:26:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 5242.88 | 40806.4 | 46049.28 | Idle-Timeout |
4 | user1 | 2022-05-11 02:59:38 | 00:00:11:52 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 22067.2 | 10772.48 | 32839.68 | Idle-Timeout |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4707 | user9 | 2022-11-04 01:11:34 | 00:06:54:32 | 10.55.4.189 | DA:2F:97:0E:B7:D0 | 107960.32 | 2390753.28 | 2495610.88 | Idle-Timeout |
4708 | user9 | 2022-11-04 10:26:09 | 00:00:23:49 | 10.55.4.59 | DA:2F:97:0E:B7:D0 | 11407.36 | 209674.24 | 221081.60 | Idle-Timeout |
4709 | user9 | 2022-11-04 20:41:42 | 00:01:24:13 | 10.55.15.186 | DA:2F:97:0E:B7:D0 | 18995.2 | 373657.6 | 392652.80 | Idle-Timeout |
4710 | user9 | 2022-11-05 00:21:06 | 00:08:49:43 | 10.55.4.159 | DA:2F:97:0E:B7:D0 | 46602.24 | 593766.4 | 640368.64 | Idle-Timeout |
4711 | user9 | 2022-11-05 20:55:37 | 00:01:06:20 | 10.55.2.33 | DA:2F:97:0E:B7:D0 | 21237.76 | 298536.96 | 319774.72 | NaN |
4712 rows × 9 columns
# Show the column labels of the loaded frame (display-only expression).
# Note that the data itself spells one column 'seession_break_reason'.
internet_usage.columns
Index(['name', 'start_time', 'usage_time', 'IP', 'MAC', 'upload', 'download', 'total_transfer', 'seession_break_reason'], dtype='object')
# Count the missing values in each column (display-only expression).
internet_usage.isna().sum()
name 0 start_time 0 usage_time 0 IP 0 MAC 0 upload 0 download 0 total_transfer 0 seession_break_reason 9 dtype: int64
# Discard every row that has a missing value in any column and continue with
# an explicit copy of the result. The surviving rows keep their original index
# labels; the index is not renumbered here.
internet_usage = internet_usage.dropna(axis=0, how='any').copy()
# Re-run the per-column missing-value count to confirm nothing is left.
internet_usage.isna().sum()
name 0 start_time 0 usage_time 0 IP 0 MAC 0 upload 0 download 0 total_transfer 0 seession_break_reason 0 dtype: int64
# Label every row with 'device<N>', where N starts at 1 on the first row and
# goes up by one each time a row's MAC differs from the MAC of the row
# directly before it.
# NOTE: the numbering identifies runs of consecutive rows, not unique MAC
# addresses; a MAC that reappears after a different one gets a new number.
basename = 'device'
mac_column = internet_usage['MAC']
# Compare each row with its predecessor by position. The previous version
# seeded the comparison with internet_usage['MAC'][0], a label lookup that
# raises KeyError when the row labelled 0 has been removed by dropna().
# shift() leaves the first row without a predecessor, so it is flagged as the
# start of run 1.
starts_new_run = mac_column.ne(mac_column.shift())
run_number = starts_new_run.cumsum()
device = (basename + run_number.astype(str)).tolist()
internet_usage['device'] = device
# Show the dtype of every column (display-only expression).
# NOTE(review): in the recorded output 'upload' and 'download' are object
# dtype while 'total_transfer' is float64; convert them before doing any
# arithmetic on those two columns.
internet_usage.dtypes
name object start_time datetime64[ns] usage_time object IP object MAC object upload object download object total_transfer float64 seession_break_reason object device object dtype: object
# Hour of day (0-23) in which each session started.
internet_usage['hour'] = pd.to_datetime(internet_usage['start_time']).dt.hour
# Number of sessions started in each hour, ordered by hour rather than by count.
frequent_activity_time_of_day = internet_usage['hour'].value_counts().sort_index()

# Line plot of session count (y) against hour of day (x).
plt.figure(figsize=(18, 9))
sns.lineplot(data=frequent_activity_time_of_day)
plt.xticks(np.linspace(start=0, stop=24, num=25))
plt.show()
plt.clf()

# The busiest hour is the index label with the highest session count.
# The previous version printed internet_usage.usage_time.max(), which is the
# largest session-duration string and says nothing about the time of day.
most_frequent_hour = (
    None
    if frequent_activity_time_of_day.empty
    else frequent_activity_time_of_day.idxmax()
)
print("Most frequent activity time of day is", most_frequent_hour)
Most frequent activity time of day is 01:00:21:07
<Figure size 432x288 with 0 Axes>
# Count how many times the IP address differs from the one on the row
# directly before it, walking the frame in its current row order.
# The previous version seeded the comparison with a MAC address string and
# started at row 1, so the first comparison always counted as a change even
# when rows 0 and 1 shared the same IP.
ip_column = internet_usage['IP']
differs_from_previous = ip_column.ne(ip_column.shift())
# Row 0 has no predecessor, so it is excluded from the count.
ip_count = int(differs_from_previous.iloc[1:].sum())
print('The IP Adress changed ' + str(ip_count) + ' times')
The IP Adress changed 2303 times
# Count how often the 'device' label changes between consecutive rows.
# The walk starts from the label 'device1' and from the second row onwards.
device_column = internet_usage['device']
base_device = 'device1'
device_count = 0
for position in range(1, len(device_column)):
    current_device = device_column.iloc[position]
    if current_device == base_device:
        continue
    # A different label than the last one seen: record the switch.
    device_count += 1
    base_device = current_device
print('The device changed ' + str(device_count) + ' times')
The device changed 1223 times
# Calendar parts of each session's start timestamp: day of month and month number.
start_times = internet_usage['start_time']
internet_usage['day'] = start_times.dt.day
internet_usage['month'] = start_times.dt.month

# Mean 'total_transfer' of the sessions that started in each hour of the day.
# Relies on the 'hour' column already being present on the frame.
hourly_average = internet_usage.groupby('hour')['total_transfer'].mean()
print('The Average usage per hour is:\n ' + str(hourly_average.round(2)))
The Average usage per hour is: hour 0 464530.44 1 530880.86 2 431576.11 3 345303.34 4 359809.44 5 275960.91 6 468959.59 7 292886.83 8 366681.92 9 377480.64 10 393259.12 11 309492.45 12 310137.98 13 335270.58 14 472403.71 15 517005.11 16 403919.40 17 525423.69 18 666590.76 19 389841.79 20 355862.80 21 474038.34 22 449600.50 23 407785.08 Name: total_transfer, dtype: float64
# Bar chart: one bar per start hour, bar height = mean total_transfer,
# drawn without error bars.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar`;
# check the installed version before switching.
plt.figure(figsize=(15, 7))
hour_axes = sns.barplot(
    data=internet_usage,
    x='hour',
    y='total_transfer',
    estimator=np.mean,
    ci=None,
)
hour_axes.set_title("Average usage per hour")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
# Mean 'total_transfer' per value of the 'day' column. That column holds the
# day of the month taken from start_time, so sessions from the same day number
# in different months fall into the same group.
daily_average = internet_usage.groupby('day')['total_transfer'].mean()
print('The Average usage per day is:\n ' + str(daily_average.round(2)))
The Average usage per day is: day 1 396705.04 2 494496.48 3 445865.63 4 676332.03 5 652195.66 6 396261.75 7 402259.89 8 301859.57 9 393521.97 10 350665.02 11 729857.65 12 346695.95 13 501906.70 14 352701.10 15 521520.51 16 426719.39 17 475795.71 18 337490.93 19 301941.32 20 365130.12 21 462211.69 22 486595.37 23 383153.93 24 320598.94 25 443689.47 26 463432.02 27 324318.12 28 494576.34 29 363645.61 30 361418.88 31 369118.01 Name: total_transfer, dtype: float64
# Bar chart: one bar per 'day' value, bar height = mean total_transfer,
# drawn without error bars.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar`;
# check the installed version before switching.
plt.figure(figsize=(15, 7))
day_axes = sns.barplot(
    data=internet_usage,
    x='day',
    y='total_transfer',
    estimator=np.mean,
    ci=None,
)
day_axes.set_title("Average usage per day")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>
# Mean 'total_transfer' of the sessions that started in each calendar month
# (grouped by the month number held in the 'month' column).
monthly_average = internet_usage.groupby('month')['total_transfer'].mean()
print('The Average usage per month is:\n ' + str(monthly_average.round(2)))
The Average usage per month is: month 5 311177.16 6 338418.08 7 418583.99 8 479042.44 9 482955.52 10 549467.63 11 399804.11 Name: total_transfer, dtype: float64
# Bar chart: one bar per 'month' value, bar height = mean total_transfer,
# drawn without error bars.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar`;
# check the installed version before switching.
plt.figure(figsize=(15, 7))
month_axes = sns.barplot(
    data=internet_usage,
    x='month',
    y='total_transfer',
    estimator=np.mean,
    ci=None,
)
month_axes.set_title("Average usage per month")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>