Answer for the following questions usign Data Analysis.
What is the most frequent internet activity time of the day ?
How often the ip changes ?
How often the device changed.
What is the average usage per hour , per day and per month ?
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
path = 'C:\\Users\\mahit\\OneDrive\\Desktop\\DSPP\\ML & DL\\Assignments\\JNTUH ML DL Assignment 1\\'
df = pd.read_csv(path+'internet_session.csv')
df
name | start_time | usage_time | IP | MAC | upload | download | total_transfer | seession_break_reason | |
---|---|---|---|---|---|---|---|---|---|
0 | user1 | 2022-05-10 02:59:32 | 00:00:36:28 | 10.55.14.222 | 48:E7:DA:58:22:E9 | 15861.76 | 333168.64 | 349030.40 | Idle-Timeout |
1 | user1 | 2022-05-10 18:53:27 | 00:01:49:56 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 16957.44 | 212152.32 | 229109.76 | Idle-Timeout |
2 | user1 | 2022-05-10 21:20:44 | 00:01:35:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 14080.0 | 195153.92 | 209233.92 | Idle-Timeout |
3 | user1 | 2022-05-11 00:37:42 | 00:00:26:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 5242.88 | 40806.4 | 46049.28 | Idle-Timeout |
4 | user1 | 2022-05-11 02:59:38 | 00:00:11:52 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 22067.2 | 10772.48 | 32839.68 | Idle-Timeout |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4707 | user9 | 2022-11-04 01:11:34 | 00:06:54:32 | 10.55.4.189 | DA:2F:97:0E:B7:D0 | 107960.32 | 2390753.28 | 2495610.88 | Idle-Timeout |
4708 | user9 | 2022-11-04 10:26:09 | 00:00:23:49 | 10.55.4.59 | DA:2F:97:0E:B7:D0 | 11407.36 | 209674.24 | 221081.60 | Idle-Timeout |
4709 | user9 | 2022-11-04 20:41:42 | 00:01:24:13 | 10.55.15.186 | DA:2F:97:0E:B7:D0 | 18995.2 | 373657.6 | 392652.80 | Idle-Timeout |
4710 | user9 | 2022-11-05 00:21:06 | 00:08:49:43 | 10.55.4.159 | DA:2F:97:0E:B7:D0 | 46602.24 | 593766.4 | 640368.64 | Idle-Timeout |
4711 | user9 | 2022-11-05 20:55:37 | 00:01:06:20 | 10.55.2.33 | DA:2F:97:0E:B7:D0 | 21237.76 | 298536.96 | 319774.72 | NaN |
4712 rows × 9 columns
df.head()
name | start_time | usage_time | IP | MAC | upload | download | total_transfer | seession_break_reason | |
---|---|---|---|---|---|---|---|---|---|
0 | user1 | 2022-05-10 02:59:32 | 00:00:36:28 | 10.55.14.222 | 48:E7:DA:58:22:E9 | 15861.76 | 333168.64 | 349030.40 | Idle-Timeout |
1 | user1 | 2022-05-10 18:53:27 | 00:01:49:56 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 16957.44 | 212152.32 | 229109.76 | Idle-Timeout |
2 | user1 | 2022-05-10 21:20:44 | 00:01:35:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 14080.0 | 195153.92 | 209233.92 | Idle-Timeout |
3 | user1 | 2022-05-11 00:37:42 | 00:00:26:00 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 5242.88 | 40806.4 | 46049.28 | Idle-Timeout |
4 | user1 | 2022-05-11 02:59:38 | 00:00:11:52 | 10.55.2.253 | 48:E7:DA:58:22:E9 | 22067.2 | 10772.48 | 32839.68 | Idle-Timeout |
df.isnull().sum()
name 0 start_time 0 usage_time 0 IP 0 MAC 0 upload 0 download 0 total_transfer 0 seession_break_reason 9 dtype: int64
# Data Imputation of seession_break_reason column
imp = SimpleImputer(strategy="most_frequent")
df['seession_break_reason'] = imp.fit_transform(df['seession_break_reason'].values.reshape(-1, 1))
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4712 entries, 0 to 4711 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 4712 non-null object 1 start_time 4712 non-null object 2 usage_time 4712 non-null object 3 IP 4712 non-null object 4 MAC 4712 non-null object 5 upload 4712 non-null object 6 download 4712 non-null object 7 total_transfer 4712 non-null float64 8 seession_break_reason 4712 non-null object dtypes: float64(1), object(8) memory usage: 331.4+ KB
df['name'].value_counts()
user4 727 user1 674 user6 674 user9 572 user7 528 user3 519 user2 457 user5 336 user8 225 Name: name, dtype: int64
# Data Encoding of MAC using labelEncoder()
label_encoder = preprocessing.LabelEncoder()
df['MAC']= label_encoder.fit_transform(df['MAC'])
df['MAC'].unique()
array([ 4, 0, 7, 18, 10, 32, 2, 15, 30, 3, 23, 22, 19, 12, 8, 24, 16, 20, 28, 5, 13, 26, 31, 21, 6, 9, 11, 17, 27, 29, 1, 14, 25])
sns.countplot(df['name'])
<AxesSubplot:xlabel='name', ylabel='count'>
print('The earliest start time is:' )
print(df.start_time.min())
print('The latest start time is:')
print(df.start_time.max())
The earliest start time is: 2022-05-09 22:52:41 The latest start time is: 2022-11-05 21:54:24
print('The minimum usage time is:')
print(df.usage_time.min())
print('The maximum usage time is:')
print(df.usage_time.max())
The minimum usage time is: 00:00:00:01 The maximum usage time is: 01:00:21:07
# Minimum Usage time for each user
minimum_usage_time = df.groupby('name').usage_time.min()
minimum_usage_time
name user1 00:00:00:18 user2 00:00:00:08 user3 00:00:00:01 user4 00:00:00:45 user5 00:00:01:07 user6 00:00:00:18 user7 00:00:00:20 user8 00:00:00:20 user9 00:00:00:09 Name: usage_time, dtype: object
# Maximum Usage time for each user
maximum_usage_time = df.groupby('name').usage_time.max()
maximum_usage_time
name user1 00:19:35:11 user2 00:20:39:52 user3 00:17:01:28 user4 01:00:21:07 user5 00:06:36:11 user6 00:19:35:11 user7 00:22:00:07 user8 00:17:24:26 user9 00:19:26:09 Name: usage_time, dtype: object
df['IP'].value_counts()
10.55.0.89 80 10.55.14.148 64 10.55.15.221 55 10.55.1.50 48 10.55.10.46 44 .. 10.55.14.67 1 10.55.7.44 1 10.55.12.225 1 10.55.12.190 1 10.55.2.33 1 Name: IP, Length: 1302, dtype: int64
df['MAC'].value_counts()
4 1236 25 561 12 481 22 465 31 362 24 242 20 237 15 231 1 189 21 147 32 128 0 104 3 73 5 70 19 54 14 36 26 23 17 12 28 12 23 11 2 7 30 7 13 5 10 4 16 3 9 2 6 2 18 2 7 2 11 1 27 1 29 1 8 1 Name: MAC, dtype: int64
max_upload = df.groupby('name').upload.min()
max_upload
name user1 1006.74 user2 10117.12 user3 10055.68 user4 10147.84 user5 10076.16 user6 1006.74 user7 10035.2 user8 10004.48 user9 100720.64 Name: upload, dtype: object
min_upload = df.groupby('name').upload.max()
min_upload
name user1 9932.8 user2 9994.24 user3 99860.48 user4 9963.52 user5 9973.76 user6 9932.8 user7 99860.48 user8 9922.56 user9 9963.52 Name: upload, dtype: object
max_download = df.groupby('name').download.max()
max_download
name user1 999393.28 user2 99676.16 user3 99635.2 user4 9953.28 user5 990812.16 user6 999393.28 user7 984227.84 user8 97884.16 user9 9984.0 Name: download, dtype: object
min_download = df.groupby('name').download.min()
min_download
name user1 100167.68 user2 10035.2 user3 100044.8 user4 100055.04 user5 10076.16 user6 100167.68 user7 100014.08 user8 101672.96 user9 1001216.0 Name: download, dtype: object
max_total_transfer = df.groupby('name').total_transfer.max()
max_total_transfer
name user1 24389877.76 user2 22051553.28 user3 3282042.88 user4 8524922.88 user5 5158993.92 user6 24389877.76 user7 28552724.48 user8 3166699.52 user9 6155141.12 Name: total_transfer, dtype: float64
min_total_transfer = df.groupby('name').total_transfer.min()
min_total_transfer
name user1 75.34 user2 73.13 user3 1.12 user4 13.45 user5 924.40 user6 75.34 user7 15.25 user8 18.08 user9 102.64 Name: total_transfer, dtype: float64
df['seession_break_reason'].value_counts()
Idle-Timeout 4359 Lost-Carrier 162 Lost-Service 124 User-Request 65 NAS-Reboot 2 Name: seession_break_reason, dtype: int64
sns.countplot(df['seession_break_reason'])
<AxesSubplot:xlabel='seession_break_reason', ylabel='count'>
plt.figure(figsize=(18, 9))
ax = sns.countplot(x='name', hue = 'seession_break_reason' , data = df )
ax.bar_label(ax.containers[0])
plt.title("Session break reason per User")
plt.show()
plt.clf()
<Figure size 432x288 with 0 Axes>