This data analysis project aims to determine the optimal advertising strategy for an e-learning company specializing in programming courses. The company's offerings span various domains, including web and mobile development, data science, and game development. Our goal is to identify the two most promising markets in which to invest advertising funds and, by leveraging existing data sources and analytical tools, to provide actionable insights that inform the company's advertising decisions.
In order to determine the most effective markets for advertising our programming courses, conducting surveys in different markets is one option. However, this approach can be expensive, so we should explore more economical options first.
One alternative is to search for relevant data that already exists. One promising source is the 2017 New Coder Survey, conducted by freeCodeCamp, a free e-learning platform specializing in web development courses. The survey, which received responses from more than 20,000 people, was published on freeCodeCamp's popular Medium publication, which has over 400,000 followers. It attracted not only those interested in web development but also new coders with diverse interests, making it a valuable resource for our analysis.
The survey data is publicly available in this GitHub repository.
To ensure that we are equipped with the necessary tools to analyze the data, we will begin by importing the required libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
# Define a colormap
cmap = plt.get_cmap('viridis')
# To avoid display of truncated columns
pd.options.display.max_columns=150
We will now quickly explore the 2017-fCC-New-Coders-Survey-Data.csv file, which is stored in the clean-data folder of the repository mentioned earlier. Alternatively, we can use the direct link provided here to read in the file.
# Read the survey data into dataframe and view first five rows
fcc_survey = pd.read_csv('2017-fCC-New-Coders-Survey-Data.csv', low_memory=False)
# Display number of rows and columns in dataset
print(f'Total Rows in survey data: {fcc_survey.shape[0]}')
print(f'Total Columns in survey data: {fcc_survey.shape[1]}')
Total Rows in survey data: 18175
Total Columns in survey data: 136
fcc_survey.columns
Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampLoanYesNo', 'BootcampName', 'BootcampRecommend', 'ChildrenNumber', 'CityPopulation', 'CodeEventConferences', 'CodeEventDjangoGirls', ... 'YouTubeFCC', 'YouTubeFunFunFunction', 'YouTubeGoogleDev', 'YouTubeLearnCode', 'YouTubeLevelUpTuts', 'YouTubeMIT', 'YouTubeMozillaHacks', 'YouTubeOther', 'YouTubeSimplilearn', 'YouTubeTheNewBoston'], dtype='object', length=136)
# Find the percentage of null values in each column
round(fcc_survey.isnull().sum()*100/ len(fcc_survey), 1)
Age                    15.4
AttendedBootcamp        2.6
BootcampFinish         94.1
BootcampLoanYesNo      94.1
BootcampName           94.8
                       ...
YouTubeMIT             81.7
YouTubeMozillaHacks    96.6
YouTubeOther           93.3
YouTubeSimplilearn     98.9
YouTubeTheNewBoston    83.7
Length: 136, dtype: float64
There appear to be numerous columns with a significant proportion of null values. The plot below provides a summary of the number of non-null values present in each column of the dataset.
# Calculate non-null values of each column
cols_notnull_vals = fcc_survey.notnull().sum()
# Create a figure and axes
fig, ax = plt.subplots(figsize=(20, 5))
# Get the number of bars to plot
num_bars = len(cols_notnull_vals)
# Plot the non-null proportion of each column
bars = ax.bar(x=fcc_survey.columns, height=cols_notnull_vals,
color=cmap(np.linspace(0, 1, num_bars)))
# Set title and axis labels
ax.set_title("Data Availability in FreeCodeCamp's New Coder Survey: A Visual Representation of Non-Null Values",
weight='bold', size=20)
ax.set_xlabel('Survey questions (columns 1 to 136)', size=16)
# Remove spines
for side in ['top', 'right', 'left']:
ax.spines[side].set_visible(False)
# Remove ticks and tick labels
ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
plt.show()
The dataset we are working with has a large number of empty or null values. To identify which fields are relevant for analysis and which can be disregarded, we need additional knowledge about the contents of each column. While most column names are self-explanatory, there is no clear documentation explaining the meaning of each one.
# Display the first five rows of the dataset
fcc_survey.head()
[Output: the first five rows of all 136 columns. The table is too wide to reproduce legibly here; the sparsely answered columns are mostly NaN, alongside populated entries such as Age, CityPopulation (e.g. "more than 1 million"), CountryLive (Canada, United States of America, Brazil, Portugal), EmploymentStatus, and SchoolDegree.]
To gain a better understanding of each column, we can consult the datapackage.json file in the clean-data folder of the repository mentioned earlier. It contains the original survey questions, which provide a description of each column. With this information, we can determine which fields are relevant for our analysis and disregard those that are not.
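To illustrate, the snippet below sketches how such metadata could be read programmatically. The JSON excerpt is a hypothetical example following the Frictionless Data datapackage/Table Schema convention; the real file's structure and descriptions may differ.

```python
import json

# Hypothetical excerpt of a datapackage.json file (illustrative only;
# the actual field descriptions in the repository may differ).
sample_datapackage = '''
{
  "resources": [
    {
      "path": "2017-fCC-New-Coders-Survey-Data.csv",
      "schema": {
        "fields": [
          {"name": "Age", "description": "How old are you?"},
          {"name": "CountryLive", "description": "Which country do you live in?"}
        ]
      }
    }
  ]
}
'''

def field_descriptions(datapackage_text):
    """Map each column name to its survey-question description."""
    package = json.loads(datapackage_text)
    fields = package['resources'][0]['schema']['fields']
    return {f['name']: f.get('description', '') for f in fields}

descriptions = field_descriptions(sample_datapackage)
print(descriptions['CountryLive'])
```

A lookup like this makes it easy to check what any cryptic column name actually asked the respondent.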
# Display the names of all columns present in the dataset
print(fcc_survey.columns.tolist())
['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampLoanYesNo', 'BootcampName', 'BootcampRecommend', 'ChildrenNumber', 'CityPopulation', 'CodeEventConferences', 'CodeEventDjangoGirls', 'CodeEventFCC', 'CodeEventGameJam', 'CodeEventGirlDev', 'CodeEventHackathons', 'CodeEventMeetup', 'CodeEventNodeSchool', 'CodeEventNone', 'CodeEventOther', 'CodeEventRailsBridge', 'CodeEventRailsGirls', 'CodeEventStartUpWknd', 'CodeEventWkdBootcamps', 'CodeEventWomenCode', 'CodeEventWorkshops', 'CommuteTime', 'CountryCitizen', 'CountryLive', 'EmploymentField', 'EmploymentFieldOther', 'EmploymentStatus', 'EmploymentStatusOther', 'ExpectedEarning', 'FinanciallySupporting', 'FirstDevJob', 'Gender', 'GenderOther', 'HasChildren', 'HasDebt', 'HasFinancialDependents', 'HasHighSpdInternet', 'HasHomeMortgage', 'HasServedInMilitary', 'HasStudentDebt', 'HomeMortgageOwe', 'HoursLearning', 'ID.x', 'ID.y', 'Income', 'IsEthnicMinority', 'IsReceiveDisabilitiesBenefits', 'IsSoftwareDev', 'IsUnderEmployed', 'JobApplyWhen', 'JobInterestBackEnd', 'JobInterestDataEngr', 'JobInterestDataSci', 'JobInterestDevOps', 'JobInterestFrontEnd', 'JobInterestFullStack', 'JobInterestGameDev', 'JobInterestInfoSec', 'JobInterestMobile', 'JobInterestOther', 'JobInterestProjMngr', 'JobInterestQAEngr', 'JobInterestUX', 'JobPref', 'JobRelocateYesNo', 'JobRoleInterest', 'JobWherePref', 'LanguageAtHome', 'MaritalStatus', 'MoneyForLearning', 'MonthsProgramming', 'NetworkID', 'Part1EndTime', 'Part1StartTime', 'Part2EndTime', 'Part2StartTime', 'PodcastChangeLog', 'PodcastCodeNewbie', 'PodcastCodePen', 'PodcastDevTea', 'PodcastDotNET', 'PodcastGiantRobots', 'PodcastJSAir', 'PodcastJSJabber', 'PodcastNone', 'PodcastOther', 'PodcastProgThrowdown', 'PodcastRubyRogues', 'PodcastSEDaily', 'PodcastSERadio', 'PodcastShopTalk', 'PodcastTalkPython', 'PodcastTheWebAhead', 'ResourceCodecademy', 'ResourceCodeWars', 'ResourceCoursera', 'ResourceCSS', 'ResourceEdX', 'ResourceEgghead', 'ResourceFCC', 'ResourceHackerRank', 'ResourceKA', 
'ResourceLynda', 'ResourceMDN', 'ResourceOdinProj', 'ResourceOther', 'ResourcePluralSight', 'ResourceSkillcrush', 'ResourceSO', 'ResourceTreehouse', 'ResourceUdacity', 'ResourceUdemy', 'ResourceW3S', 'SchoolDegree', 'SchoolMajor', 'StudentDebtOwe', 'YouTubeCodeCourse', 'YouTubeCodingTrain', 'YouTubeCodingTut360', 'YouTubeComputerphile', 'YouTubeDerekBanas', 'YouTubeDevTips', 'YouTubeEngineeredTruth', 'YouTubeFCC', 'YouTubeFunFunFunction', 'YouTubeGoogleDev', 'YouTubeLearnCode', 'YouTubeLevelUpTuts', 'YouTubeMIT', 'YouTubeMozillaHacks', 'YouTubeOther', 'YouTubeSimplilearn', 'YouTubeTheNewBoston']
Out of the 136 columns in the dataset, the relevant ones for our analysis are those that provide information about each coder's subject interests, willingness to invest in education and the extent of their budget, current geographic location, and preferred language for communication. Although some fields may not be required for the project's objective at this point, they might be useful for additional analysis and are therefore included.
Considering these relevant columns and their potential future use, we will keep only those necessary for our analysis and discard the rest:
# Filter dataset with relevant columns
rel_fcc_survey = fcc_survey[['Age', 'AttendedBootcamp', 'BootcampLoanYesNo', 'CountryCitizen', 'CountryLive',
'EmploymentField', 'EmploymentStatus', 'Gender', 'HasChildren',
'HasDebt', 'HasFinancialDependents', 'HasHomeMortgage', 'HomeMortgageOwe',
'HasStudentDebt', 'StudentDebtOwe', 'HoursLearning', 'Income', 'JobRoleInterest',
'MaritalStatus', 'MoneyForLearning', 'MonthsProgramming',
'SchoolDegree', 'SchoolMajor']].copy()
Next, we will examine whether the remaining columns still have a significant number of missing values:
round(rel_fcc_survey.isnull().sum()*100/len(rel_fcc_survey),0)
Age                       15.0
AttendedBootcamp           3.0
BootcampLoanYesNo         94.0
CountryCitizen            15.0
CountryLive               16.0
EmploymentField           55.0
EmploymentStatus          21.0
Gender                    15.0
HasChildren               79.0
HasDebt                   16.0
HasFinancialDependents    16.0
HasHomeMortgage           65.0
HomeMortgageOwe           92.0
HasStudentDebt            65.0
StudentDebtOwe            82.0
HoursLearning              8.0
Income                    58.0
JobRoleInterest           62.0
MaritalStatus             16.0
MoneyForLearning           9.0
MonthsProgramming          6.0
SchoolDegree              15.0
SchoolMajor               52.0
dtype: float64
Several columns, including EmploymentField, HasChildren, HasHomeMortgage, HasStudentDebt, Income, and JobRoleInterest, have a high percentage of missing values, up to 79%. It's possible that many respondents considered these questions irrelevant or, in the case of JobRoleInterest, may not yet have specific plans for their future job role. On the other hand, the MoneyForLearning column has relatively few missing values, so we will use it as the primary column for financial information instead of Income.
As we mentioned earlier, the courses we offer cover a range of domains, including web and mobile development, data science, and game development. However, for the purpose of our analysis, we are interested in a specific population: new coders who are interested in the subjects we teach. We want to answer questions about this population, such as their location and their willingness to invest in learning. Specifically, we would like to know:
Before we can begin analyzing our sample data, we must first determine whether it is representative of our population of interest and includes the appropriate categories of people for our purposes.
In order to determine if our sample data is representative of our population of interest, we will examine the JobRoleInterest
column. This column lists the job roles that each participant is interested in pursuing. By examining the values in this column, we can gain a better understanding of the interests of our population.
# Display the job roles of 50 participants who responded
for i in range(50):
if pd.notnull(rel_fcc_survey.loc[i, 'JobRoleInterest']):
print(rel_fcc_survey.loc[i, 'JobRoleInterest'])
Full-Stack Web Developer Front-End Web Developer, Back-End Web Developer, DevOps / SysAdmin, Mobile Developer, Full-Stack Web Developer Front-End Web Developer, Full-Stack Web Developer, Back-End Web Developer Full-Stack Web Developer, Information Security, Mobile Developer, Front-End Web Developer, Back-End Web Developer Full-Stack Web Developer Full-Stack Web Developer, Quality Assurance Engineer, Game Developer, Back-End Web Developer, User Experience Designer, Front-End Web Developer DevOps / SysAdmin, Data Scientist, Information Security, Data Engineer Back-End Web Developer, Full-Stack Web Developer, Front-End Web Developer Full-Stack Web Developer Full-Stack Web Developer Full-Stack Web Developer Full-Stack Web Developer, Front-End Web Developer, User Experience Designer, Back-End Web Developer Front-End Web Developer, Mobile Developer, Game Developer, Full-Stack Web Developer Information Security Full-Stack Web Developer Back-End Web Developer Full-Stack Web Developer Front-End Web Developer, Data Scientist, Full-Stack Web Developer, Mobile Developer Back-End Web Developer, Full-Stack Web Developer Front-End Web Developer Data Scientist, Information Security, Data Engineer Full-Stack Web Developer, Quality Assurance Engineer Back-End Web Developer, Full-Stack Web Developer Back-End Web Developer, Full-Stack Web Developer, Front-End Web Developer, Data Scientist Mobile Developer, Product Manager Front-End Web Developer, Back-End Web Developer, Full-Stack Web Developer, Mobile Developer, Data Scientist, Information Security Front-End Web Developer Full-Stack Web Developer Back-End Web Developer, Front-End Web Developer, Full-Stack Web Developer
Transforming the JobRoleInterest column

Transforming the role values is necessary to avoid treating the same subjects as different. For instance, "Full-Stack Developer" and "Full stack developer" will both become "full stack developer". This transformation is conditional and straightforward. There are scenarios where a more thorough clean-up would be required, but given the limited data in this column, those scenarios have not been considered for transformation.
In addition to these scenarios, there may be cases where certain job roles are represented differently, such as "system engineer" and "systems engineer," or "front-end web developer" and "front end web developer." However, for the purposes of this analysis, only the basic and conditional transformation has been applied to ensure consistency in the job role values.
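For illustration, a more aggressive normalizer that collapses such variants could look like the sketch below. The variant pairs are assumptions for demonstration, not values confirmed to occur in the survey.

```python
import re

# Hypothetical variant-collapsing rules (illustrative assumptions only).
EQUIVALENTS = [
    (re.compile(r'\bsystems\b'), 'system'),        # "systems engineer" -> "system engineer"
    (re.compile(r'front[\s-]?end'), 'front end'),  # "front-end"/"frontend" -> "front end"
    (re.compile(r'back[\s-]?end'), 'back end'),    # "back-end"/"backend" -> "back end"
]

def normalize_role(role):
    """Lowercase a role name and collapse known spelling variants."""
    role = role.strip().lower()
    for pattern, replacement in EQUIVALENTS:
        role = pattern.sub(replacement, role)
    # collapse any remaining runs of whitespace
    return re.sub(r'\s+', ' ', role)

print(normalize_role('Systems Engineer'))         # system engineer
print(normalize_role('Front-End Web Developer'))  # front end web developer
```

A rule table like this is easy to extend as new spelling variants are discovered in the data.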
print(f'Total number of coders who responded for their job role interests: \
{rel_fcc_survey.JobRoleInterest.notnull().sum()}')
print(f'Total number of coders who responded for their job role interests (%): \
{round(rel_fcc_survey.JobRoleInterest.notnull().sum()/len(rel_fcc_survey)*100)}')
Total number of coders who responded for their job role interests: 6992
Total number of coders who responded for their job role interests (%): 38
The JobRoleInterest column in the rel_fcc_survey dataframe has a total of 6992 responses, but the data is inconsistent and messy. Each row can have multiple job role interests separated by commas, and some values have extra characters or capitalization that differentiate the same subject. To clean up this column, we will use the pandas string accessor to convert all values to lowercase and remove the characters that differentiate the same subjects. The resulting column will hold a list of job role interests for each row, making the data easier to analyze and compare.
rel_fcc_survey['JobRoleInterest'] = (rel_fcc_survey.JobRoleInterest
.str.replace(r' |-', ' ', regex=True) # replace hyphens with whitespace so "front-end" matches "front end"
.str.replace(r',\s+', ',', regex=True) # remove whitespace after commas before splitting
.str.strip()
.str.lower()
.str.split(','))
rel_fcc_survey['JobRoleInterest'].dropna().head(5)
1    [full stack web developer]
2    [front end web developer, back end web develop...
3    [front end web developer, full stack web devel...
4    [full stack web developer, information securit...
6    [full stack web developer]
Name: JobRoleInterest, dtype: object
# Display the job roles after the transformation (only non-null rows)
for i in range(50):
if isinstance(rel_fcc_survey.loc[i, 'JobRoleInterest'], list) and len(rel_fcc_survey.loc[i, 'JobRoleInterest']) > 0:
print(rel_fcc_survey.loc[i, 'JobRoleInterest'])
# compare difference below
# rel_fcc_survey['JobRoleInterest'].dropna().head(50)
['full stack web developer'] ['front end web developer', 'back end web developer', 'devops / sysadmin', 'mobile developer', 'full stack web developer'] ['front end web developer', 'full stack web developer', 'back end web developer'] ['full stack web developer', 'information security', 'mobile developer', 'front end web developer', 'back end web developer'] ['full stack web developer'] ['full stack web developer', 'quality assurance engineer', 'game developer', 'back end web developer', 'user experience designer', 'front end web developer'] ['devops / sysadmin', 'data scientist', 'information security', 'data engineer'] ['back end web developer', 'full stack web developer', 'front end web developer'] ['full stack web developer'] ['full stack web developer'] ['full stack web developer'] ['full stack web developer', 'front end web developer', 'user experience designer', 'back end web developer'] ['front end web developer', 'mobile developer', 'game developer', 'full stack web developer'] ['information security'] ['full stack web developer'] ['back end web developer'] ['full stack web developer'] ['front end web developer', 'data scientist', 'full stack web developer', 'mobile developer'] ['back end web developer', 'full stack web developer'] ['front end web developer'] ['data scientist', 'information security', 'data engineer'] ['full stack web developer', 'quality assurance engineer'] ['back end web developer', 'full stack web developer'] ['back end web developer', 'full stack web developer', 'front end web developer', 'data scientist'] ['mobile developer', 'product manager'] ['front end web developer', 'back end web developer', 'full stack web developer', 'mobile developer', 'data scientist', 'information security'] ['front end web developer'] ['full stack web developer'] ['back end web developer', 'front end web developer', 'full stack web developer']
The following code computes the number of roles each coder has shown interest in; together with the most preferred roles, this gives a general idea of demand, which we visualize below.
# Count the number of subjects/roles each coder has shown interest in
roles_count = (
rel_fcc_survey.JobRoleInterest.apply(lambda x: len(x)
if x is not np.nan else np.nan).value_counts(normalize=True).mul(100).round(2)
)
roles_count
1.0     31.65
3.0     15.89
4.0     15.22
5.0     12.04
2.0     10.88
6.0      6.72
7.0      3.86
8.0      1.76
9.0      0.99
10.0     0.47
12.0     0.30
11.0     0.19
13.0     0.03
Name: JobRoleInterest, dtype: float64
The output shows the distribution of the number of job roles that coders are interested in. The largest share of coders (31.65%) are interested in only one job role, while around 15-16% are interested in three or four roles, about 12% in five roles, and 10.88% in two roles. The share of coders interested in six or more roles tails off quickly, from 6.72% down to fractions of a percent; only 0.03% selected 13 roles.
We will now identify the most popular courses among the coders and save the results into a dataframe.
roles_dict = {}
for roles in rel_fcc_survey.JobRoleInterest:
if roles is not np.nan:
for role in roles:
if role in roles_dict:
roles_dict[role] += 1
else:
roles_dict[role] = 1
top_roles = pd.DataFrame(roles_dict.items(), columns=['RoleName', 'Count'])
# Compute each role's contribution as a percentage
top_roles['Count%'] = ((top_roles.Count/sum(top_roles.Count)).mul(100).round(2))
# Select only the top 10 most sought-after courses
top_roles = top_roles.sort_values(by='Count%', ascending=False).head(10).reset_index(drop=True)
top_roles
| RoleName | Count | Count% |
|---|---|---|
| full stack web developer | 4198 | 18.58 |
| front end web developer | 3533 | 15.63 |
| back end web developer | 2772 | 12.27 |
| mobile developer | 2305 | 10.20 |
| data scientist | 1643 | 7.27 |
| game developer | 1628 | 7.20 |
| user experience designer | 1469 | 6.50 |
| information security | 1326 | 5.87 |
| data engineer | 1248 | 5.52 |
| devops / sysadmin | 927 | 4.10 |
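The dictionary-based tally above can also be expressed with pandas alone. The sketch below uses Series.explode() on illustrative toy lists (not actual survey values) to show the equivalent approach.

```python
import pandas as pd

# Toy stand-in for the cleaned JobRoleInterest column: one list of roles
# per respondent, None for non-respondents (values are illustrative).
job_roles = pd.Series([
    ['full stack web developer'],
    ['front end web developer', 'data scientist'],
    None,
    ['data scientist', 'full stack web developer'],
])

# explode() turns each list element into its own row; a plain
# value_counts() then reproduces the dictionary-based tally.
role_counts = job_roles.explode().value_counts()
role_share = (role_counts / role_counts.sum() * 100).round(2)

print(role_counts)
print(role_share)
```

On the real column this one-liner replaces the explicit loop, at the cost of being slightly less transparent about how missing values are skipped.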
# Create figure and axes
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
# Get the number of bars to plot
num_bars1 = len(roles_count)
num_bars2 = len(top_roles)
# Plot number of courses
bars1 = ax1.bar(x=roles_count.index.astype(int), height=roles_count, color=cmap(np.linspace(0, 1, num_bars1)))
# Plot most preferred courses
bars2 = ax2.barh(y=top_roles['RoleName'], width=top_roles['Count%'], color=cmap(np.linspace(0, 1, num_bars2)))
# Invert the y-axis for the second plot
ax2.invert_yaxis()
# Set titles and axis labels for both plots
ax1.set_title('No. of Roles Selected vs. % of Coders Interests', weight='bold', size=18)
ax1.set_xlabel('Number of Roles Selected', size=14)
ax1.set_ylabel('Coders Percentage (%)', size=14)
ax1.tick_params(axis='x', labelsize=14)
ax1.tick_params(axis='y', labelsize=14)
ax2.set_title('10 Most Preferred Roles', weight='bold', size=18)
ax2.set_xlabel('Coders Interests (%)', size=14)
ax2.tick_params(axis='x', labelsize=14)
ax2.tick_params(axis='y', labelsize=14)
# Remove spines from both plots
for ax in [ax1, ax2]:
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_linewidth(0.5)
# Add values to bars
for bar in bars1:
height = bar.get_height()
ax1.annotate(f'{height}', xy=(bar.get_x() + bar.get_width() / 2, height),
xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', size=11)
for bar in bars2:
width = bar.get_width()
ax2.annotate(f'{width:.1f}%', xy=(width, bar.get_y() + bar.get_height() / 2),
xytext=(3, 0), textcoords="offset points", ha='left', va='center', size=12)
# Set common title for both plots
fig.suptitle('Distribution of Preferred Roles among Coders', weight='bold', size=20)
plt.tight_layout()
plt.show()
Based on the above distributions, the following observations can be summarized:
The distribution of the number of roles selected by coders indicates that many are interested in more than one subject. The top 10 most preferred roles also provide insight into the range of courses coders are interested in.
Moreover, it is evident that front-end and full-stack web development can be considered specializations under the broader term "Web Developer". Since the data has not undergone a thorough cleaning process, responses have been categorized into just two groups: "web or mobile" and "others".
To analyze the job role data, we will define a function called search_web_mob that searches for the terms "web developer" or "mobile developer" in a list of job roles. If either term is found, the function classifies the respondent as "web or mobile"; otherwise it returns "others".

After defining the function, we will use it to create a new column called WebOrMobDev, which indicates whether each respondent expressed interest in web or mobile development.
# Compile the regular expression once at module level so every call reuses the same object
pattern = re.compile(r'web\s?developer|mobile\s?developer')

def search_web_mob(job_list):
if job_list is not np.nan:
# web_mob_list = list(filter(pattern.search, job_list))
web_mob_list = [s for s in job_list if pattern.search(s)]
if len(web_mob_list) > 0:
return 'web or mobile'
else:
return 'others'
rel_fcc_survey['WebOrMobDev'] = rel_fcc_survey.JobRoleInterest.apply(search_web_mob)
# Calculate the relative frequency (percentage) of coders' interests
jobs_percent = rel_fcc_survey.WebOrMobDev.value_counts(normalize=True)
# Extract frequency indexes (['web or mobile', 'others'])
jobs_freq_indexes = rel_fcc_survey.WebOrMobDev.value_counts().index
# Create horizontal bar plot
fig, ax = plt.subplots(figsize=(10, 2))
ax.barh(y=jobs_freq_indexes, width=jobs_percent, color='tab:blue', alpha=0.8)
# Set title and axis labels
ax.set_title('Coders interested in Web or Mobile Development', weight='bold', size=20)
ax.set_xlabel('% of Coders', size=14)
ax.set_xticks([])
ax.tick_params(axis='y', labelsize=12, left=False)
# Disable spines
for side in ['top', 'right', 'left']:
    ax.spines[side].set_visible(False)
# Add bar values in the center of each bar in white color
for i, v in enumerate(jobs_percent):
    ax.text(v/2, i, str(round(v*100, 2))+'%', color='white', va='center', ha='center', fontweight='bold', size=12)
plt.show()
Based on the results obtained, it can be observed that the majority of the respondents in the dataset are interested in web and/or mobile development skills. Specifically, 86.27% of the respondents have mentioned at least one of the two roles, while 13.73% have mentioned others.
In terms of representativeness, we can conclude that the dataset is valid for further analysis. The fact that it includes a significant number of respondents interested in web and/or mobile development suggests that it is representative of the population we are targeting, making it useful for conducting further analysis and drawing meaningful insights.
After confirming that the sample is representative of the target population, we can now dive into analyzing it. Our first step is to identify the location and densities of new coders. The dataset provides information about the participants' locations at the country level, with CountryCitizen indicating the country of origin and CountryLive indicating the current country of residence. We will be using the CountryLive variable for our analysis since we are interested in the location of the participants at the time of running the ads.
We can view each country as a separate market and aim to identify the two best countries to advertise in. One way to measure a market's potential is to look at the number of potential customers. More potential customers in a market indicate a better market. For example, if we can convince 10% of 5000 potential customers in market A to buy our product, it is better than convincing 100% of 30 potential customers in market B.
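The reasoning above can be sanity-checked with a quick calculation using the hypothetical market sizes and conversion rates from the example (illustrative figures only, not survey data):

```python
# Hypothetical markets from the example above (illustrative, not survey data)
market_a_customers = 5000
market_a_conversion = 0.10   # convince 10% of market A
market_b_customers = 30
market_b_conversion = 1.00   # convince 100% of market B

conversions_a = market_a_customers * market_a_conversion
conversions_b = market_b_customers * market_b_conversion

print(f'Market A yields {conversions_a:.0f} customers')   # 500
print(f'Market B yields {conversions_b:.0f} customers')   # 30
```

Even a modest conversion rate in the larger market outperforms a perfect one in the smaller market, which is why raw market size matters so much.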
To ensure the reliability of our analysis, it is recommended to eliminate rows from the dataset where participants did not provide information on their preferred role. This is because we cannot accurately determine their interests if they did not respond, and it is better to exclude this group of participants from our analysis.
# Drop rows where coders haven't responded with preferred job role(s)
rel_fcc_survey.dropna(subset=['JobRoleInterest'], inplace=True)
# Work on a copy containing only non-null values of the `JobRoleInterest` column
good_fcc_survey = rel_fcc_survey[rel_fcc_survey.JobRoleInterest.notnull()].copy()
# Check results (should output 0)
good_fcc_survey.JobRoleInterest.isnull().sum()
0
In the next step, we'll define a function called create_freq_table that takes two arguments, df and column. Using value_counts(), the function computes the absolute frequency of each unique value in the given column, along with the relative frequency expressed as a rounded percentage.
After calling the function with the necessary arguments, we store the resulting frequencies in the variables abs_freq and rel_freq. Finally, we combine the two into a DataFrame, freq_table, which displays the top countries in the given column alongside their absolute and relative frequencies.
def create_freq_table(df, column):
    # Calculate absolute and relative frequencies
    abs_freq = df[column].value_counts()
    rel_freq = df[column].value_counts(normalize=True).mul(100).round()
    return abs_freq, rel_freq
# Call the function to access absolute and relative frequencies
abs_freq, rel_freq = create_freq_table(good_fcc_survey, 'CountryLive')
# Store lists of values in dataframe
freq_table = pd.DataFrame({'Absolute Frequency': abs_freq,
'Relative Frequency': rel_freq})
# View top 10 results
freq_table.head(10)
| | Absolute Frequency | Relative Frequency |
|---|---|---|
| United States of America | 3125 | 46.0 |
| India | 528 | 8.0 |
| United Kingdom | 315 | 5.0 |
| Canada | 260 | 4.0 |
| Poland | 131 | 2.0 |
| Brazil | 129 | 2.0 |
| Germany | 125 | 2.0 |
| Australia | 112 | 2.0 |
| Russia | 102 | 1.0 |
| Ukraine | 89 | 1.0 |
# Calculate counts of countries with relative frequency above or below 2%
above_2_count = (freq_table['Relative Frequency'] >= 2.0).sum()
below_2_count = (freq_table['Relative Frequency'] < 2.0).sum()
# Create a horizontal bar plot
fig, ax = plt.subplots(figsize=(10, 2))
ax.barh(y=['2% or above', 'Below 2%'],
width=[above_2_count, below_2_count],
color='tab:blue', alpha=0.8)
ax.set_title('Countries with 2% or More Respondents', weight='bold', size=20)
ax.tick_params(axis='y', labelsize=12, left=False)
ax.set_xticks([])
ax.set_xlabel('Count', size=14)
# Display bar values
for i, v in enumerate([above_2_count, below_2_count]):
    ax.text(v * 0.5, i, int(v), size='large', color='white', fontweight='bold', ha='center', va='center')
# Disable spines
for side in ['top', 'right', 'left']:
    ax.spines[side].set_visible(False)
plt.show()
Grouping countries by whether they account for at least 2% of the respondents gives a clearer view of the results: only 8 countries meet that threshold. The plot and data below provide a detailed breakdown of these top 8 countries, which contribute the highest percentages of coders in the survey.
# Filter the data for countries with relative frequency of 2% or above
freq_table_filtered = freq_table[freq_table['Relative Frequency'] >= 2.0]
# Create the horizontal bar plot
fig, ax = plt.subplots(figsize=(10, 6))
# Get the number of bars to plot
num_bars = len(freq_table_filtered)
bars = ax.barh(y=freq_table_filtered.index, width=freq_table_filtered['Relative Frequency'],
color=cmap(np.linspace(0, 1, num_bars)), height=0.5, alpha=0.9, edgecolor='white')
# Add value labels
for bar in bars:
    width = bar.get_width()
    plt.text(width + 1.7, bar.get_y() + 0.25, f'{width:.1f}%', ha='center', va='center', fontsize=12)
# Add title and axis labels
ax.set_title('Distribution of Coders among Top 8 Countries', weight='bold', size=20)
ax.set_xlabel('Percentage of Coders', size=14)
# Set x-axis tick format to percentage
ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
# Invert the y-axis to show the bars in descending order
ax.invert_yaxis()
# Remove spines
for side in ['top', 'right', 'bottom', 'left']:
    ax.spines[side].set_visible(False)
plt.show()
According to our findings, the USA is the most promising market for advertisement, as 46% of our potential customers reside there. However, when looking at the table and graph, it's unclear which country should be considered the second-best market. While India ranks second, with 8% of potential customers, the difference between India and the third-ranked UK, with 5% of potential customers, is not significant.
Therefore, although we have gained valuable insights into the location of new coders, we cannot rely on this information alone to make a decision. We need to conduct further analysis to determine the best course of action.
We previously found useful information about the location of new coders, including the countries where they live. However, to make an informed decision about where to advertise, we need to dig deeper into our analysis. Specifically, we need to determine how much money new coders are willing to spend on learning, since advertising to markets where most people are only willing to learn for free is unlikely to be profitable for us.
To accomplish this, we can use the MoneyForLearning column, which describes, in US dollars, the amount of money participants spent from the moment they started coding until the moment they completed the survey. Since our company sells subscriptions at a price of $59 per month, we're interested in finding out how much money each student spends per month.
To focus our analysis, we will narrow it down to four countries: the US, India, the United Kingdom, and Canada. We made this decision for two reasons:
To better understand the spending behavior of new coders, we can create a new column that describes the amount of money a student has spent per month (at the moment they completed the survey). This will help us to estimate how much money a typical new coder might be willing to spend on learning.
To create this new column, we need to divide the MoneyForLearning column by the MonthsProgramming column. However, some students indicated that they had been learning to code for 0 months (perhaps they had just started when they completed the survey). To avoid dividing by 0, we'll replace all values of 0 with 1 before performing the division.
# Filter 'CountryLive' column for the top 4 countries ('United States of America', 'India', 'United Kingdom', and 'Canada')
top_4 = ['United States of America', 'India', 'United Kingdom', 'Canada']
target_fcc_survey = good_fcc_survey[good_fcc_survey['CountryLive'].isin(top_4)].copy()
# Replace 0 with 1 in 'MonthsProgramming'
target_fcc_survey['MonthsProgramming'] = target_fcc_survey['MonthsProgramming'].replace(0, 1)
# Divide 'MoneyForLearning' by 'MonthsProgramming' and create new column to store results
target_fcc_survey['SpendingPerMonth'] = target_fcc_survey['MoneyForLearning'] / target_fcc_survey['MonthsProgramming']
# View results
target_fcc_survey[['MoneyForLearning', 'MonthsProgramming', 'SpendingPerMonth']].head()
| | MoneyForLearning | MonthsProgramming | SpendingPerMonth |
|---|---|---|---|
| 1 | 80.0 | 6.0 | 13.333333 |
| 2 | 1000.0 | 5.0 | 200.000000 |
| 6 | 0.0 | 12.0 | 0.000000 |
| 13 | NaN | NaN | NaN |
| 14 | NaN | 9.0 | NaN |
Next, we will check for null values in the SpendingPerMonth column and then remove the rows containing them from the dataframe.
col_null_before = target_fcc_survey.SpendingPerMonth.isnull().sum()
print(f'Null values in SpendingPerMonth before dropping rows: {col_null_before}')
# Drop rows where the value is null in 'SpendingPerMonth'
target_fcc_survey = target_fcc_survey.dropna(subset='SpendingPerMonth')
col_null_after = target_fcc_survey.SpendingPerMonth.isnull().sum()
print(f'Null values in SpendingPerMonth after dropping rows: {col_null_after}')
Null values in SpendingPerMonth before dropping rows: 313
Null values in SpendingPerMonth after dropping rows: 0
We'll group the data by the CountryLive column and calculate the mean and median of how much money a student spends per month in the US, India, the United Kingdom, and Canada. The mean takes every value in the distribution into account, while the median gives us a robust point of comparison.
# Find 'mean' and 'median' of each country based on student spending
student_spending = target_fcc_survey.groupby('CountryLive')['SpendingPerMonth'].agg(['count','mean', 'median']).round()
# Sort values by 'mean' in descending order
student_spending = student_spending.sort_values('mean', ascending=False)
student_spending
| CountryLive | count | mean | median |
|---|---|---|---|
| United States of America | 2933 | 228.0 | 3.0 |
| India | 463 | 135.0 | 0.0 |
| Canada | 240 | 114.0 | 0.0 |
| United Kingdom | 279 | 46.0 | 0.0 |
# Create the horizontal bar plot
fig, ax = plt.subplots(figsize=(10, 5))
# Get the number of bars to plot
num_bars = len(student_spending)
bars = ax.barh(y=student_spending.index, width=student_spending['mean'],
color=cmap(np.linspace(0, 1, num_bars)), height=0.3, alpha=0.9, edgecolor='white')
# Add value labels
for bar in bars:
    width = bar.get_width()
    plt.text(width + 1, bar.get_y() + 0.15, f'${width:.0f}', ha='left', va='center', fontsize=12)
# Add title and axis labels
ax.set_title('Average Monthly Spending Per Student', weight='bold', size=20)
ax.set_xlabel('Average Spending (USD)', size=14)
# Invert the y-axis to show the bars in descending order
ax.invert_yaxis()
# Remove spines
for side in ['top', 'right', 'bottom', 'left']:
    ax.spines[side].set_visible(False)
plt.show()
Based on our findings, we can see that the United States has the highest mean spending per month on learning programming among the four countries surveyed, with a value of $228. However, the median spending of the United States is only $3, indicating that a significant portion of the respondents spend little to nothing on learning programming. India and Canada also have relatively high mean spending per month, with values of $135 and $114 respectively, but similarly low median spending of $0. The United Kingdom has the lowest mean and median spending per month, with values of $46 and $0 respectively. These results suggest that while there are some individuals who spend a significant amount on learning programming, a large portion of respondents spend little to nothing, indicating a potential market for affordable programming courses.
Interestingly, the mean values for Canada and the United Kingdom are surprisingly low compared to those for India. Given certain socio-economic indicators, such as the GDP per capita world map, we might expect people in Canada and the UK to spend more on learning than those in India.
This discrepancy could be due to a lack of representative data for Canada, the UK, and India, or due to outliers (such as incorrect survey responses) that have inflated the mean for India or deflated it for Canada and the UK. Alternatively, the results may be accurate.
Also, the table above shows the median values for the amount of money spent by survey respondents from different countries on learning to code. The median value represents the middle point of the distribution of values, which is less sensitive to outliers than the mean value. In this case, the median values for all countries are 0.0, except for the United States of America where it is 3.0. This indicates that most respondents from these countries did not spend any money on learning to code, except for those in the United States who spent at least 3 dollars. However, it is important to note that the median value alone does not provide a complete picture of the data distribution and should be considered along with other measures such as the mean, standard deviation, and quartiles.
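The robustness of the median to outliers is easy to demonstrate with a toy example (illustrative numbers, not actual survey responses): a handful of respondents who pay nothing plus a single extreme payer.

```python
import numpy as np

# Toy spending values: four respondents who paid nothing and one extreme payer
spending = np.array([0, 0, 0, 0, 5000])

mean_spend = np.mean(spending)      # pulled up to 1000.0 by the single outlier
median_spend = np.median(spending)  # stays at 0.0, unaffected by the outlier

print(mean_spend, median_spend)
```

A single extreme value drags the mean far from where most of the data sits, while the median still reflects the typical respondent.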
With that objective in mind, we will generate four box plots to analyze the spread of the variable denoting the monthly expenditure of respondents in each country - the United States, India, Canada, and the United Kingdom - to identify any potential outliers.
def create_box_plot(df=target_fcc_survey,
                    column='CountryLive',
                    title='Monthly Money Spending Per Country',
                    xlabel=''):
    '''
    Create box plots to visualize the distribution of the 'SpendingPerMonth' column by country
    '''
    # Define figure and axis
    fig, ax = plt.subplots(figsize=(10, 4))
    # Define custom properties for the boxplot
    boxprops = dict(linewidth=2)
    whiskerprops = dict(linewidth=2)
    capprops = dict(linewidth=2)
    medianprops = dict(linestyle='-', linewidth=2.5, color='orange')
    meanpointprops = dict(marker='o', markersize=8, markerfacecolor='lime', markeredgecolor='black')
    # Create boxplot
    ax.boxplot([df.loc[df[column] == value, 'SpendingPerMonth'] for value in df[column].unique()],
               labels=df[column].unique(),
               showmeans=True,
               meanprops=meanpointprops,
               boxprops=boxprops,
               whiskerprops=whiskerprops,
               capprops=capprops,
               medianprops=medianprops,
               flierprops=dict(marker='o', markersize=3))
    # Add title and labels
    ax.set_title(title, weight='bold', size=20)
    ax.tick_params(axis='both', labelsize=10, bottom=False)
    ax.set_xlabel(xlabel, size=14)
    ax.set_ylabel('Spending Per Month ($)', size=14)
    ax.set_ylim(0, None)
    # Remove spines
    for side in ['top', 'right', 'bottom']:
        ax.spines[side].set_visible(False)
    plt.show()
# Create box plots for the 'SpendingPerMonth' column by countries
create_box_plot()
We can observe that the severely right-skewed distributions make our box plots appear squeezed and display numerous outliers for each country. Nonetheless, the primary concern is the presence of two extreme outliers for the United States. Although these outliers may represent accurate values (such as advance payments for expensive courses), they distort the overall picture significantly. As a result, we cannot draw meaningful conclusions regarding India, the United Kingdom, and Canada, nor can we accurately locate the mean values (represented by light-green circles) on the distributions. To address this, we will eliminate the outliers and recalculate the mean values for monthly spending by country, as we did previously.
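Here we cap the extreme values by eye; a common data-driven alternative is the 1.5 × IQR rule. A minimal sketch on illustrative numbers (not the survey data) shows how it flags extreme values:

```python
import pandas as pd

# Illustrative monthly spending values (not the survey data)
spending = pd.Series([0, 5, 10, 25, 50, 80, 120, 200, 15000])

q1, q3 = spending.quantile([0.25, 0.75])
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr   # values above this are flagged as outliers

outliers = spending[spending > upper_fence]
print(f'Upper fence: ${upper_fence:.0f}')   # $285
print(outliers.tolist())                    # [15000]
```

For this project we stick with manual inspection of the flagged rows, since some large values may be legitimate (e.g. bootcamp tuition) and deserve a case-by-case decision.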
# Filter the values of the 'SpendingPerMonth' column smaller than $20,000
filter_fcc = target_fcc_survey[target_fcc_survey['SpendingPerMonth'] < 20000].copy()
# Recalculate the 'mean' of each country based on student spending per month
recal_student_spending = filter_fcc.groupby('CountryLive')['SpendingPerMonth'].agg(['mean']).round()
recal_student_spending = recal_student_spending.sort_values('mean', ascending=False)
recal_student_spending
| CountryLive | mean |
|---|---|
| United States of America | 184.0 |
| India | 135.0 |
| Canada | 114.0 |
| United Kingdom | 46.0 |
# Create new box plots to visualize the updated distribution (filter_fcc)
create_box_plot(df=filter_fcc)
The calculated mean values are still unusual, and the box plots remain compressed with numerous outliers, particularly for the United States but also for the other three countries.
We need to examine the outliers for India. Some individuals paid more than $2,500 per month, and we should focus on them and investigate their responses to other survey questions. It's possible that these respondents participated in some high-cost bootcamps or educational programs, which would explain their high learning expenses.
# Function to return a dataframe containing the outliers
def find_outliers(country: str, spending: int):
    return filter_fcc[(filter_fcc['CountryLive'] == country) & (filter_fcc['SpendingPerMonth'] >= spending)]
# Find outliers for India
india_outliers = find_outliers('India', 2500)
india_outliers
Age | AttendedBootcamp | BootcampLoanYesNo | CountryCitizen | CountryLive | EmploymentField | EmploymentStatus | Gender | HasChildren | HasDebt | HasFinancialDependents | HasHomeMortgage | HomeMortgageOwe | HasStudentDebt | StudentDebtOwe | HoursLearning | Income | JobRoleInterest | MaritalStatus | MoneyForLearning | MonthsProgramming | SchoolDegree | SchoolMajor | WebOrMobDev | SpendingPerMonth | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1728 | 24.0 | 0.0 | NaN | India | India | NaN | A stay-at-home parent or homemaker | male | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 30.0 | NaN | [user experience designer, mobile developer, p... | single, never married | 20000.0 | 4.0 | bachelor's degree | Computer Programming | web or mobile | 5000.000000 |
1755 | 20.0 | 0.0 | NaN | India | India | NaN | Not working and not looking for work | male | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 10.0 | NaN | [information security, full stack web develope... | single, never married | 50000.0 | 15.0 | bachelor's degree | Computer Science | web or mobile | 3333.333333 |
7989 | 28.0 | 0.0 | NaN | India | India | software development and IT | Employed for wages | male | 0.0 | 1.0 | 1.0 | 0.0 | NaN | 1.0 | 2500.0 | 20.0 | 300000.0 | [user experience designer, back end web develo... | married or domestic partnership | 5000.0 | 1.0 | bachelor's degree | Aerospace and Aeronautical Engineering | web or mobile | 5000.000000 |
8126 | 22.0 | 0.0 | NaN | India | India | NaN | Not working but looking for work | male | NaN | 1.0 | 0.0 | 0.0 | NaN | 1.0 | 10000.0 | 80.0 | NaN | [back end web developer, full stack web develo... | single, never married | 5000.0 | 1.0 | bachelor's degree | Electrical and Electronics Engineering | web or mobile | 5000.000000 |
13398 | 19.0 | 0.0 | NaN | India | India | NaN | Unable to work | male | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 30.0 | NaN | [mobile developer] | single, never married | 20000.0 | 2.0 | bachelor's degree | Computer Science | web or mobile | 10000.000000 |
15587 | 27.0 | 0.0 | NaN | India | India | software development and IT | Employed for wages | male | 0.0 | 1.0 | 1.0 | 0.0 | NaN | 1.0 | 25000.0 | 36.0 | 60000.0 | [full stack web developer, data scientist] | single, never married | 100000.0 | 24.0 | bachelor's degree | Communications | web or mobile | 4166.666667 |
The analysis indicates that none of these respondents attended a bootcamp. Furthermore, it is difficult to establish from the data whether these individuals really spent such enormous amounts of money on learning. The survey asked, "Aside from university tuition, about how much money have you spent on learning to code so far (in US dollars)?", and it is possible that some respondents misinterpreted the question and included university tuition. To be cautious, it is advisable to exclude these entries from the analysis.
# Drop rows of outliers for India
filter_fcc = filter_fcc.drop(india_outliers.index).copy()
Let's now turn our attention to the other outliers for the USA - those whose monthly spending is more than $6,000.
us_outliers = find_outliers('United States of America', 6000)
us_outliers
Age | AttendedBootcamp | BootcampLoanYesNo | CountryCitizen | CountryLive | EmploymentField | EmploymentStatus | Gender | HasChildren | HasDebt | HasFinancialDependents | HasHomeMortgage | HomeMortgageOwe | HasStudentDebt | StudentDebtOwe | HoursLearning | Income | JobRoleInterest | MaritalStatus | MoneyForLearning | MonthsProgramming | SchoolDegree | SchoolMajor | WebOrMobDev | SpendingPerMonth | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
718 | 26.0 | 1.0 | 0.0 | United States of America | United States of America | architecture or physical engineering | Employed for wages | male | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 35.0 | 44500.0 | [user experience designer, full stack web deve... | single, never married | 8000.0 | 1.0 | bachelor's degree | Architecture | web or mobile | 8000.000000 |
1222 | 32.0 | 1.0 | 0.0 | United States of America | United States of America | NaN | Not working and not looking for work | female | NaN | 1.0 | 0.0 | 0.0 | NaN | 0.0 | NaN | 50.0 | NaN | [front end web developer, mobile developer, us... | single, never married | 13000.0 | 2.0 | bachelor's degree | Anthropology | web or mobile | 6500.000000 |
3184 | 34.0 | 1.0 | 0.0 | NaN | United States of America | software development and IT | Employed for wages | male | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 10.0 | 40000.0 | [quality assurance engineer, devops / sysadmin... | single, never married | 9000.0 | 1.0 | some college credit, no degree | NaN | others | 9000.000000 |
3930 | 31.0 | 0.0 | NaN | United States of America | United States of America | NaN | Not working and not looking for work | male | NaN | 1.0 | 0.0 | 0.0 | NaN | 1.0 | 40000.0 | 50.0 | NaN | [devops / sysadmin, front end web developer, f... | married or domestic partnership | 65000.0 | 6.0 | bachelor's degree | Biology | web or mobile | 10833.333333 |
6805 | 46.0 | 1.0 | 1.0 | United States of America | United States of America | NaN | Not working but looking for work | male | NaN | 1.0 | 0.0 | 0.0 | NaN | 1.0 | 45000.0 | 45.0 | NaN | [full stack web developer, game developer, pro... | married or domestic partnership | 15000.0 | 1.0 | bachelor's degree | Business Administration and Management | web or mobile | 15000.000000 |
7198 | 32.0 | 0.0 | NaN | United States of America | United States of America | education | Employed for wages | male | NaN | 1.0 | 0.0 | 0.0 | NaN | 1.0 | NaN | 4.0 | NaN | [full stack web developer, back end web develo... | single, never married | 70000.0 | 5.0 | professional degree (MBA, MD, JD, etc.) | Computer Science | web or mobile | 14000.000000 |
7505 | 26.0 | 1.0 | 1.0 | United States of America | United States of America | NaN | Not working but looking for work | male | NaN | 1.0 | 0.0 | 0.0 | NaN | 1.0 | 20000.0 | 40.0 | NaN | [mobile developer, full stack web developer, i... | single, never married | 20000.0 | 3.0 | bachelor's degree | Economics | web or mobile | 6666.666667 |
9778 | 33.0 | 1.0 | 1.0 | United States of America | United States of America | education | Employed for wages | male | NaN | 1.0 | 0.0 | 0.0 | NaN | 1.0 | 45000.0 | 40.0 | 20000.0 | [full stack web developer, data engineer, qual... | single, never married | 8000.0 | 1.0 | master's degree (non-professional) | Chemical Engineering | web or mobile | 8000.000000 |
16650 | 29.0 | 0.0 | NaN | United States of America | United States of America | NaN | Not working but looking for work | male | 1.0 | 1.0 | 1.0 | 1.0 | 400000.0 | 1.0 | 30000.0 | 40.0 | NaN | [product manager, data engineer, full stack we... | married or domestic partnership | 200000.0 | 12.0 | associate's degree | Computer Programming | web or mobile | 16666.666667 |
16997 | 27.0 | 0.0 | NaN | United States of America | United States of America | health care | Employed for wages | female | 1.0 | 1.0 | 1.0 | 0.0 | NaN | 1.0 | 12500.0 | 12.0 | 40000.0 | [mobile developer, game developer, user experi... | single, never married | 12500.0 | 1.0 | some college credit, no degree | NaN | web or mobile | 12500.000000 |
17231 | 50.0 | 0.0 | NaN | Kenya | United States of America | NaN | Not working but looking for work | female | 1.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | 1.0 | NaN | [front end web developer] | married or domestic partnership | 30000.0 | 2.0 | bachelor's degree | Computer Programming | web or mobile | 15000.000000 |
Upon examining the box plot above, it is evident that there are 11 extreme outliers, six of whom attended bootcamps, which justifies the large sums of money spent on learning. For the remaining five outliers, however, it is difficult to determine where they spent that much money on learning. Therefore, we will remove the rows where participants reported spending $6,000 or more per month but never attended a bootcamp.
Additionally, the data also shows that eight respondents had been programming for no more than three months when they completed the survey. They most likely paid a substantial amount of money for a bootcamp that lasted for several months. Therefore, the amount of money spent per month is unrealistic and should be significantly lower. As a result, we will remove these eight outliers as well.
Hence, in the next code block, we will remove the outlier respondents who didn't attend a bootcamp, as well as those who had been programming for three months or less.
# Remove the respondents from the US who didn't attend a bootcamp
us_no_bootcamp = filter_fcc[(filter_fcc['CountryLive'] == 'United States of America') &
(filter_fcc['SpendingPerMonth'] >= 6000) &
(filter_fcc['AttendedBootcamp'] == 0)]
filter_fcc = filter_fcc.drop(us_no_bootcamp.index).copy()
# Remove the respondents from the US who have been programming for 3 months or less
us_less_than_3_months = filter_fcc[(filter_fcc['CountryLive'] == 'United States of America') &
(filter_fcc['SpendingPerMonth'] >= 6000) &
(filter_fcc['MonthsProgramming'] <= 3)]
filter_fcc = filter_fcc.drop(us_less_than_3_months.index).copy()
Let's take a closer look at the last box plot shown above, which reveals an unusual outlier for Canada - an individual who spends around $5000 per month on learning. We need to examine this respondent more closely.
canada_outliers = find_outliers('Canada', 5000)
canada_outliers
Age | AttendedBootcamp | BootcampLoanYesNo | CountryCitizen | CountryLive | EmploymentField | EmploymentStatus | Gender | HasChildren | HasDebt | HasFinancialDependents | HasHomeMortgage | HomeMortgageOwe | HasStudentDebt | StudentDebtOwe | HoursLearning | Income | JobRoleInterest | MaritalStatus | MoneyForLearning | MonthsProgramming | SchoolDegree | SchoolMajor | WebOrMobDev | SpendingPerMonth | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13659 | 24.0 | 1.0 | 0.0 | Canada | Canada | finance | Employed for wages | male | NaN | 1.0 | 0.0 | 1.0 | 250000.0 | 0.0 | NaN | 10.0 | 140000.0 | [mobile developer, full stack web developer, d... | single, never married | 10000.0 | 2.0 | bachelor's degree | Finance | web or mobile | 5000.0 |
The situation for this participant is similar to some of the US respondents. The data shows that this participant had been programming for no more than two months when completing the survey. It seems like this participant paid a large sum of money to enroll in a bootcamp at the beginning, and then probably didn't spend anything for the next couple of months after the survey. We'll take the same approach as we did for the US and remove this outlier.
# Drop rows of outliers for Canada
filter_fcc = filter_fcc.drop(canada_outliers.index)
We'll calculate the new mean values and create the final box plots.
# Recalculate the 'mean' of each country based on student spending per month
recal_stud_spending = filter_fcc.groupby('CountryLive')['SpendingPerMonth'].agg(['mean']).round().copy()
recal_stud_spending = recal_stud_spending.sort_values('mean', ascending=False)
recal_stud_spending
| CountryLive | mean |
|---|---|
| United States of America | 143.0 |
| Canada | 93.0 |
| India | 66.0 |
| United Kingdom | 46.0 |
# Create new box plots (filter_fcc)
create_box_plot(df=filter_fcc)
Based on the findings, it is evident that the US is a promising market for advertising, as there is a significant number of new coders residing in the country who are willing to spend a substantial amount of money on learning each month. However, in order to expand our target market, we need to identify another potential market to advertise in.
Our company's subscription fee is $59 per month, and according to the data, Canada is the most promising second market for us. People in Canada spend around $93 per month on learning to code, while in India and the United Kingdom the amounts are $66 and $46 per month, respectively. However, before we make a final decision, let's take a closer look at India by considering the following points:
The data strongly suggests that advertising in the UK is not a wise investment, so choosing between India and Canada is crucial for our marketing strategy.
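That reasoning can be made concrete by comparing each country's recalculated mean spending against the $59 subscription price (the mean values below are taken from the table above):

```python
# Recalculated mean monthly spending per country (from the table above)
mean_spending = {
    'United States of America': 143,
    'Canada': 93,
    'India': 66,
    'United Kingdom': 46,
}
subscription_price = 59  # our monthly subscription fee in USD

# Countries where average monthly spending meets or exceeds the price
viable = [country for country, spend in mean_spending.items()
          if spend >= subscription_price]
print(viable)
```

Only the United Kingdom falls below the subscription price, which is why it drops out of consideration while the US, Canada, and India remain candidates.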
# Frequency table for the 'CountryLive' column
round(filter_fcc['CountryLive'].value_counts(normalize=True)*100)
United States of America    75.0
India                       12.0
United Kingdom               7.0
Canada                       6.0
Name: CountryLive, dtype: float64
We are currently facing a decision-making challenge between Canada and India as our second best choice for advertising after the US. Although Canada seems like a more attractive option, India might offer a better opportunity due to its significant number of potential customers. We have several possible courses of action:
Advertise in the US, India, and Canada by splitting the advertisement budget in various combinations:
Advertise only in the US and India, or the US and Canada. It's wise to split the advertisement budget unequally:
Advertise only in the US.
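One simple way to weigh these options is to combine each country's share of potential customers with its mean monthly spending into a rough score. Note that this weighting formula is an illustrative heuristic of ours, not part of the survey analysis; the figures come from the frequency and mean tables above.

```python
# Share of potential customers and mean monthly spending (from the tables above)
customer_share = {'United States of America': 0.75, 'India': 0.12,
                  'Canada': 0.06, 'United Kingdom': 0.07}
mean_spending = {'United States of America': 143, 'India': 66,
                 'Canada': 93, 'United Kingdom': 46}

# Rough market score: share of customers x mean monthly spending
scores = {country: round(customer_share[country] * mean_spending[country], 2)
          for country in customer_share}

for country, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f'{country}: {score}')
```

Under this particular heuristic India (7.92) edges out Canada (5.58), which illustrates why the choice between them is not clear-cut and is best left to the marketing team's domain knowledge.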
We should send our analysis to the marketing team and let them use their domain knowledge to decide. They may want to conduct extra surveys in India and Canada and then provide us with the new survey data for analysis.
In this project, we aimed to determine the best markets to advertise our coding subscription service by analyzing survey data from new coders. After examining the data, we found that the US would be a good market to advertise in. This is due to the high number of new coders living in the US who are willing to pay a good amount of money each month for a subscription service. However, when it came to deciding on the second best market, it was not clear-cut whether we should choose India or Canada. While Canada seemed like a tempting choice due to the high amount of money new coders were willing to spend on average, India had almost twice as many potential customers.
As a result of this uncertainty, we decided to send the results to the marketing team. By doing so, we allowed them to use their domain knowledge to make the best decision based on our analysis. Additionally, they may choose to conduct extra surveys in India and Canada to gather more information, which would allow us to analyze the new data and make a more informed decision. Overall, we believe that our analysis has provided valuable insights and will help guide our marketing strategy moving forward.