import pandas as pd
import numpy as np
# Column names for the census extract; the file is read without a header row,
# so these names are supplied explicitly and the first line is treated as data.
features = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Martial Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Target",
]

# NOTE(review): the fields in adult.data appear to be separated by ", "
# (comma + blank) -- confirm against the raw file. Without skipinitialspace
# every string value keeps a leading blank (" Male", " Bachelors"), so exact
# comparisons such as `df['Sex'] == 'Female'` or `value in [...]` silently
# never match. Substring filters (`str.contains`) behave the same either way.
df = pd.read_csv('adult.data', names=features, skipinitialspace=True)
df
Age | Workclass | fnlwgt | Education | Education-Num | Martial Status | Occupation | Relationship | Race | Sex | Capital Gain | Capital Loss | Hours per week | Country | Target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
# Number of rows per distinct value of the 'Sex' column.
sex_column = df['Sex']
gender_counts = sex_column.value_counts()
print(gender_counts)
Male 21790 Female 10771 Name: Sex, dtype: int64
# Mean age over every row, followed by the distinct ages that occur.
all_ages = df['Age']
age_avg = all_ages.mean()
print(age_avg)
print(all_ages.unique())

# Two-column (Age, Sex) view of the full frame.
women_data = df[['Age', 'Sex']]

# Rows whose 'Sex' value contains the substring 'Female', and their mean age.
is_female = df['Sex'].str.contains('Female')
female_data = df[is_female]
women_age_avg = female_data['Age'].mean()
women_age_avg
38.58164675532078 [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86 87]
36.85823043357163
# Rows whose 'Country' value contains the substring 'Germany'.
is_german = df['Country'].str.contains('Germany')
German_citizen_data = df[is_german]

# Share of those rows among all rows of the frame.
num_of_german = German_citizen_data.shape[0]
total_rows = df.shape[0]
prportion_of_german_citizen = num_of_german / total_rows
prportion_of_german_citizen
object
0.004207487485028101
# Subset of rows whose 'Target' label contains '>50K'.
earns_over_50k = df['Target'].str.contains('>50K')
data_morethan_50 = df.loc[earns_over_50k]
data_morethan_50
Age | Workclass | fnlwgt | Education | Education-Num | Martial Status | Occupation | Relationship | Race | Sex | Capital Gain | Capital Loss | Hours per week | Country | Target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 52 | Self-emp-not-inc | 209642 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | >50K |
8 | 31 | Private | 45781 | Masters | 14 | Never-married | Prof-specialty | Not-in-family | White | Female | 14084 | 0 | 50 | United-States | >50K |
9 | 42 | Private | 159449 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 5178 | 0 | 40 | United-States | >50K |
10 | 37 | Private | 280464 | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | >50K |
11 | 30 | State-gov | 141297 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | India | >50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32539 | 71 | ? | 287372 | Doctorate | 16 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 10 | United-States | >50K |
32545 | 39 | Local-gov | 111499 | Assoc-acdm | 12 | Married-civ-spouse | Adm-clerical | Wife | White | Female | 0 | 0 | 20 | United-States | >50K |
32554 | 53 | Private | 321865 | Masters | 14 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
7841 rows × 15 columns
# Mean age within the '>50K' subset.
ages_of_high_earners = data_morethan_50['Age']
mean_age = ages_of_high_earners.mean()
mean_age
44.24984058155847
# Spread of ages within the '>50K' subset. np.std uses ddof=0 by default,
# i.e. the population (not sample) standard deviation; spelled out here.
high_earner_age_values = data_morethan_50['Age']
standard_deviation = np.std(high_earner_age_values, ddof=0)
standard_deviation
10.518356927661575
# Subset of rows whose 'Target' label contains '<=50K'.
earns_at_most_50k = df['Target'].str.contains('<=50K')
data_lessthan_50 = df.loc[earns_at_most_50k]
data_lessthan_50
Age | Workclass | fnlwgt | Education | Education-Num | Martial Status | Occupation | Relationship | Race | Sex | Capital Gain | Capital Loss | Hours per week | Country | Target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32553 | 32 | Private | 116138 | Masters | 14 | Never-married | Tech-support | Not-in-family | Asian-Pac-Islander | Male | 0 | 0 | 11 | Taiwan | <=50K |
32555 | 22 | Private | 310152 | Some-college | 10 | Never-married | Protective-serv | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
24720 rows × 15 columns
# Mean age within the '<=50K' subset.
ages_of_low_earners = data_lessthan_50['Age']
mean_age_lessthan = ages_of_low_earners.mean()
mean_age_lessthan
36.78373786407767
# Spread of ages within the '<=50K' subset. np.std uses ddof=0 by default,
# i.e. the population (not sample) standard deviation; spelled out here.
low_earner_age_values = data_lessthan_50['Age']
standard_deviation_lessthan = np.std(low_earner_age_values, ddof=0)
standard_deviation_lessthan
14.019804910115214
# Check whether every row of the '>50K' subset has one of the listed
# education levels, and print the verdict.
# NOTE(review): the listed levels are all beyond 'HS-grad', while the printed
# messages speak of "at least a high school education" -- confirm the intended
# wording/threshold.
required_education_levels = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']

# Strip surrounding blanks before the exact-match test: if the CSV was loaded
# with padded fields (e.g. ' Bachelors'), a plain `in` test would be False for
# every row and the check would be vacuous rather than data-driven.
education_of_high_earners = data_morethan_50['Education'].str.strip()
is_high_education = bool(education_of_high_earners.isin(required_education_levels).all())

if is_high_education:
    print("People who receive more than 50K have at least a high school education.")
else:
    print("People who receive more than 50K do not necessarily have at least a high school education.")
People who receive more than 50K do not necessarily have at least a high school education.