Hypothesis Testing: Winning Jeopardy¶

Jeopardy is a popular TV show in the US where participants answer questions to win money. I am going to work with a dataset of Jeopardy questions to figure out some patterns in the questions that could help to win.

The dataset is named jeopardy.csv and contains 20,000 rows from the beginning of the full dataset of Jeopardy questions.

Each row in the dataset represents a single question on a single episode of Jeopardy. Here are explanations of each column:

  • Show Number -- the Jeopardy episode number of the show this question was in.
  • Air Date -- the date the episode aired.
  • Round -- the round of Jeopardy that the question was asked in. Jeopardy has several rounds as each episode progresses.
  • Category -- the category of the question.
  • Value -- the number of dollars answering the question correctly is worth.
  • Question -- the text of the question.
  • Answer -- the text of the answer.

First I am going to read the dataset and explore.

In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)
Out[1]:
Show Number Air Date Round Category Value Question Answer
0 4680 2004-12-31 Jeopardy! HISTORY $200 For the last 8 years of his life, Galileo was ... Copernicus
1 4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES $200 No. 2: 1912 Olympian; football star at Carlisl... Jim Thorpe
2 4680 2004-12-31 Jeopardy! EVERYBODY TALKS ABOUT IT... $200 The city of Yuma in this state has a record av... Arizona
3 4680 2004-12-31 Jeopardy! THE COMPANY LINE $200 In 1963, live on "The Art Linkletter Show", th... McDonald's
4 4680 2004-12-31 Jeopardy! EPITAPHS & TRIBUTES $200 Signer of the Dec. of Indep., framer of the Co... John Adams
In [2]:
jeopardy.columns
Out[2]:
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

Some of the column names have leading spaces, so I am going to remove them:

In [3]:
jeopardy.columns = jeopardy.columns.str.strip()
jeopardy.columns
Out[3]:
Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

Let's have a close look at the format of each column.

In [6]:
jeopardy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Show Number  19999 non-null  int64 
 1   Air Date     19999 non-null  object
 2   Round        19999 non-null  object
 3   Category     19999 non-null  object
 4   Value        19999 non-null  object
 5   Question     19999 non-null  object
 6   Answer       19999 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB

Normalize the columns¶

One messy aspect of the Jeopardy dataset is that it contains free text. Text can have punctuation and inconsistent capitalization, which makes it hard to compare the text of an answer with the text of a question. To make that comparison easier, we need to process the text data in this step. The process of cleaning text in data analysis is sometimes called normalization. More specifically, we want to lowercase all of the words and remove any punctuation; otherwise terms like Don't and don't would be treated as different words, which we don't want.

Before starting the analysis, we need to normalize the text and fix the datatypes of some columns. I need to lowercase the Question and Answer columns and remove their punctuation, the Value column should be numeric, and the Air Date column should be a datetime.

First I am going to write a function that takes a string and returns it lowercased and with punctuation removed.

In [7]:
import re
def normalize(text):
    text = text.lower()
    # raw string avoids an invalid-escape warning for \w in newer Python versions
    text = re.sub(r'[^\w\s]', '', text)
    return text

# test normalize function
normalize("Hello! How are you?")
Out[7]:
'hello how are you'

Let's apply the normalize function to Question and Answer columns and save the result in clean_question and clean_answer columns.

In [8]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_question'].head(5)
Out[8]:
0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object
In [9]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)
jeopardy['clean_answer'].head(5)
Out[9]:
0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

To normalize the Value column I am going to remove the dollar sign from the beginning, convert it from text to numeric and save the result to a new column called clean_value.

In [13]:
def normalize_value(value):
    # strip the dollar sign and comma, then convert to int; non-numeric values become 0
    value = re.sub(r'[^\w\s]', '', value)
    try:
        value_int = int(value)
    except ValueError:
        value_int = 0
    return value_int
# test
normalize_value('$200')
Out[13]:
200
In [14]:
#apply normalize_value function to Value column
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)

The Air Date column should also be a datetime so we can work with it easily.

In [15]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

Let's check the types of all the columns again, especially the new ones.

In [16]:
jeopardy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Show Number     19999 non-null  int64         
 1   Air Date        19999 non-null  datetime64[ns]
 2   Round           19999 non-null  object        
 3   Category        19999 non-null  object        
 4   Value           19999 non-null  object        
 5   Question        19999 non-null  object        
 6   Answer          19999 non-null  object        
 7   clean_question  19999 non-null  object        
 8   clean_answer    19999 non-null  object        
 9   clean_value     19999 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.5+ MB

Study¶

In order to decide whether to study past questions, study general knowledge, or not study at all, it would be helpful to figure out two things:

  • How often words from the answer also appear in the question.
  • How often new questions are repeats of older questions.

To answer the second question I need to figure out how often complex words (6 or more characters) reoccur, and for the first question I need to see how many of the words in the answer also occur in the question.

Let's start with the first question. I am going to write a function that, for each row, calculates the ratio of the words in the answer that also appear in the question. Then I will apply it to all of the questions and compute the average. In this function, 'the' is excluded from the answer words since it is not a meaningful word.

In [18]:
def count_matches_ratio(row):
    answer = row['clean_answer']
    question = row['clean_question']
    split_answer = answer.split()
    split_question = question.split()
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count/len(split_answer)
  
# use apply() to loop over all the rows 
jeopardy['answer_in_question'] = jeopardy.apply(count_matches_ratio, axis = 1)  
jeopardy['answer_in_question'].mean()
Out[18]:
0.05900196524977763

On average, about 6% of the words in answers also appear in the corresponding questions, so the chance of deducing the answer from the question is quite low.

Repeated questions¶

Let's move on to the second question and investigate how often new questions are repeats of older ones. I cannot answer this question completely, since the dataset includes only about 10% of the full Jeopardy question set, but I am going to investigate it anyway.

I am going to check whether the terms with six or more characters in each question have been used previously or not.

In [25]:
question_overlap = []
# get unique set of words
terms_used = set()
# sort by air date so earlier questions come first and it is clear which questions are new
jeopardy.sort_values('Air Date', inplace = True)
# loop over the dataframe row by row
for i, row in jeopardy.iterrows():
    # get list of the words in a question
    split_question = row['clean_question'].split()
    # word contains 6+ characters
    split_question = [q for q in split_question if len(q)>= 6]
    match_count = 0
    for term in split_question:
        if term in terms_used:
            match_count += 1
        terms_used.add(term)
    if len(split_question) > 0:
        # normalize the count across different question length
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
# get the percentage of the repeated question
jeopardy['question_overlap'].mean()
Out[25]:
0.689481997219586

On average, about 69% of the complex words in a question have already appeared in earlier questions, so it seems that studying past questions can be really helpful for winning.

Study questions with high value¶

Let's focus our study on high value questions instead of low value ones; this is more helpful for earning money.

We can figure out which terms correspond to high-value questions using a chi-squared test. I'll first need to split the questions into two categories:

  • Low value -- any row where clean_value is 800 or less.
  • High value -- any row where clean_value is greater than 800.

I'll then be able to loop through each of the terms from terms_used, and:

  • Find the number of low value questions the word occurs in.
  • Find the number of high value questions the word occurs in.
  • Find the percentage of questions the word occurs in.
  • Based on the percentage of questions the word occurs in, find expected counts.
  • Compute the chi-squared value based on the expected counts and the observed counts for high and low value questions.

I can then find the words with the biggest differences in usage between high and low value questions, by selecting the words with the highest associated chi-squared values. Doing this for all of the words would take a very long time, so I'll just do it for a small sample now.
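To make the expected-count step concrete, here is a minimal sketch for a single hypothetical word, using made-up observed counts together with the overall high/low totals computed further down in this notebook. Under the null hypothesis, the word's occurrences are split between high and low value questions in the same proportion as the dataset as a whole.

# minimal sketch with hypothetical counts (not a cell from the analysis)
high_obs, low_obs = 3, 1                  # made-up observed counts for one word
n_high, n_low = 5734, 14265               # high/low value question totals (computed below)
n_total = n_high + n_low
total = high_obs + low_obs                # total questions containing the word
high_exp = total * n_high / n_total       # expected count if usage were independent of value
low_exp = total * n_low / n_total
chi_sq = ((high_obs - high_exp) ** 2 / high_exp
          + (low_obs - low_exp) ** 2 / low_exp)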

In [26]:
def categorize_value(row):
    value = 0
    if row['clean_value'] > 800:
        value = 1
    return value

jeopardy['high_value'] = jeopardy.apply(categorize_value, axis = 1)
In [27]:
def count_values(word):
    low_count = 0
    high_count = 0
    for _, row in jeopardy.iterrows():
        split_question = row['clean_question'].split()
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count
In [28]:
#Randomly pick ten elements of terms_used
from random import choice
comparison_terms = [choice(list(terms_used)) for _ in range(10)]
comparison_terms
Out[28]:
['recruits',
 'hotshot',
 '500000member',
 'exceptions',
 'dipsomaniac',
 'tylenol',
 'letters',
 'latvia',
 'bergens',
 'strangely']
In [29]:
observed_expected = []
for word in comparison_terms:
    observed_expected.append(count_values(word))
observed_expected
Out[29]:
[(1, 1),
 (1, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (17, 37),
 (3, 1),
 (0, 1),
 (1, 2)]

Now that we've found the observed counts for a few terms, we can compute the expected counts and the chi-squared value.

In [34]:
high_value_count = sum(jeopardy['high_value'])
low_value_count = jeopardy[jeopardy['high_value'] == 0]['high_value'].count()
# low_value_count2 = jeopardy['high_value'].count() - sum(jeopardy['high_value'])

print('high_value_count = {}'.format(high_value_count))
print('low_value_count = {}'.format(low_value_count))
high_value_count = 5734
low_value_count = 14265
In [35]:
import numpy as np
from scipy.stats import chisquare

chi_squared = []
for high_count, low_count in observed_expected:
    # total number of questions the word appears in
    total = high_count + low_count
    # proportion of all questions that contain the word
    total_prop = total / jeopardy.shape[0]
    # expected values according to the ratio of total high/low values
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([high_count, low_count])
    expected = np.array([high_value_exp, low_value_exp])
    
    chi_squared.append(chisquare(observed, expected))
    
chi_squared
Out[35]:
[Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.20850107809730017, pvalue=0.6479447887525934),
 Power_divergenceResult(statistic=4.198022975221989, pvalue=0.0404711362009595),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.03188116723440362, pvalue=0.8582887163235293)]

Looking at the results above, only one of the p-values (for 'latvia') is below 0.05, so for nearly all of these words there is no significant difference in usage between high value and low value questions. Moreover, the observed frequencies were, for all but one word, lower than 5, so the chi-squared test isn't really valid here; even the 'latvia' result rests on just 4 occurrences. It would be better to run this test only on terms with higher frequencies.

Eliminate non-informative words¶

We can eliminate non-informative words to decrease the size of terms_used, so that we can run the count_values function on more of the data. First, we can remove stopwords.

Remove stopwords¶

A stop word is a commonly used word (such as "the", "a", "an", "in") that a search engine is typically programmed to ignore, both when indexing entries for searching and when retrieving them as the results of a search query. We don't want these words to take up space or valuable processing time, so let's remove them.
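Note: the stopword removal below assumes the NLTK stopword corpus is already available locally; if it isn't, it can be fetched once with NLTK's standard download helper (a one-time setup step, assuming nltk itself is installed):

import nltk
nltk.download('stopwords')   # downloads the English stopword list used by nltk.corpus.stopwords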

In [36]:
len(terms_used)
Out[36]:
24470
In [40]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
for word in stop_words:
    if word in terms_used:
        terms_used.remove(word)
len(terms_used)
Out[40]:
24454

Remove hrefhttp¶

Looking at the words in terms_used, there are some HTML link fragments (containing hrefhttp) which are not relevant to our question, so we can remove them as well.

In [48]:
terms_used_lr = pd.Series(list(terms_used))
# The tilde (~) operator is used to invert the boolean values 
terms_used_lr = terms_used_lr[~terms_used_lr.str.contains('hrefhttp')]
len(terms_used_lr)
Out[48]:
23251

There are still 23,251 words in terms_used_lr. At this stage, we can look at the count_values function and see if I can make it run faster.

Re-write count_values function¶

Looking at the count_values function, there is a loop that iterates over the whole jeopardy dataset. We can replace it with vectorized pandas column operations to make it faster. To make the result easier to interpret, the new function returns the word as well.

In [42]:
def count_values_faster(word):
    high_count = 0
    low_count = 0
    
    # regex pattern to match the whole word only
    pattern = r"\b{}\b".format(word)
    high_count = jeopardy[(jeopardy['clean_question'].str.contains(pattern, regex = True)) &
                         (jeopardy['high_value'] == 1)]['high_value'].count()
    low_count = jeopardy[(jeopardy['clean_question'].str.contains(pattern, regex = True)) &
                        (jeopardy['high_value'] == 0)]['high_value'].count()
    return word, high_count, low_count

Let's test to make sure that we get the same result as the count_values function.

In [43]:
observed_test = []
for word in comparison_terms:
    observed_test.append(count_values_faster(word))
print(observed_test)
[('recruits', 1, 1), ('hotshot', 1, 0), ('500000member', 0, 1), ('exceptions', 0, 1), ('dipsomaniac', 0, 1), ('tylenol', 0, 1), ('letters', 17, 37), ('latvia', 3, 1), ('bergens', 0, 1), ('strangely', 1, 2)]

The test passes: the results match, and the new function is much more efficient.
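As a rough sanity check on the speedup (timings will vary by machine), one could time both versions on a single word with IPython's %timeit magic:

%timeit count_values('letters')          # row-by-row iteration over the dataframe
%timeit count_values_faster('letters')   # vectorized string matching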

I am going to apply this new function to all of the terms in terms_used_lr. It takes some time to run completely, but it is far more practical than count_values.

In [47]:
frequencies = terms_used_lr.apply(count_values_faster)
frequencies
Out[47]:
0             (boasts, 5, 6)
1          (integrity, 1, 0)
2            (puberty, 0, 1)
3            (gosling, 2, 0)
4             (seward, 0, 2)
                ...         
24449      (beatified, 0, 1)
24450         (boxers, 0, 1)
24451    (modernqueen, 0, 1)
24452     (arthropods, 1, 0)
24453        (waldorf, 1, 1)
Length: 23251, dtype: object

Words with higher frequencies¶

To make the chi-squared test valid, let's filter for words with high frequencies and run the chi-squared test on the 1,000 most frequent terms.

In [49]:
def get_high_frequecies(data, size):
    frequencies = pd.DataFrame(data, 
                               columns = ['word', 'high_value', 'low_value'])
    frequencies['total_value'] = frequencies['high_value'] + frequencies['low_value']
    frequencies.sort_values('total_value', ascending = False, inplace = True)
    return(frequencies.head(size))



high_frequecies = get_high_frequecies(list(frequencies),1000)
high_frequecies
Out[49]:
word high_value low_value total_value
867 called 168 346 514
2728 country 141 332 473
19362 played 77 212 289
8683 became 79 203 282
4831 american 77 174 251
... ... ... ... ...
17472 controversial 4 10 14
5216 consists 3 11 14
5286 stopped 1 13 14
16451 figures 4 10 14
16267 waterfall 5 9 14

1000 rows × 4 columns

In [50]:
def calculate_chi_squared(row):
    chi_squared = []
    total_prop = row['total_value']/jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    observed = np.array([row['high_value'], row['low_value']])
    expected = np.array([high_value_exp, low_value_exp])
    
    chi_value, p_value = chisquare(observed, expected)
    
    chi_squared.append((row['word'], chi_value, p_value, row['high_value'], row['low_value']))
    return chi_squared
                         
    
chi_squared = high_frequecies.apply(calculate_chi_squared, axis = 1)
chi_squared.head(5)
Out[50]:
867      [(called, 4.048305063534577, 0.044215717944225...
2728     [(country, 0.29967829483482744, 0.584084171311...
19362    [(played, 0.5810990283039111, 0.44588185909193...
8683     [(became, 0.05956570730840162, 0.8071836789959...
4831     [(american, 0.4938111242657224, 0.482232156839...
dtype: object

At this stage, we can filter for words with p-values less than 0.05 to figure out which words are used significantly differently in high value and low value questions. I am also only keeping words that appear more often in high_value questions than in low_value ones.

In [51]:
# each element of chi_squared is a one-element list holding a (word, chi, p, high, low) tuple
chi_squared_df = pd.DataFrame([c[0] for c in chi_squared], 
                              columns = ['word', 'chi_squared', 'p_value', 'high_value', 'low_value'])
chi_squared_df = chi_squared_df.sort_values('p_value')
chi_squared_df = chi_squared_df[(chi_squared_df['p_value'] < 0.05) & 
                                (chi_squared_df['high_value'] > chi_squared_df['low_value']) ]
chi_squared_df
Out[51]:
word chi_squared p_value high_value low_value
179 monitora 45.947439 1.214686e-11 35 13
78 target_blanksarah 24.358972 7.995351e-07 40 33
226 target_blankkelly 20.921282 4.785483e-06 25 16
93 african 17.283572 3.219584e-05 35 33
494 painter 16.941684 3.854581e-05 16 8
159 target_blankjimmy 16.114608 5.962236e-05 28 24
217 target_blankjon 13.979777 1.847876e-04 23 19
498 pulitzer 13.429676 2.476749e-04 15 9
388 liquid 12.719123 3.619354e-04 17 12
467 example 11.997980 5.325823e-04 15 10
592 spirit 11.341071 7.581159e-04 13 8
689 andrew 11.049381 8.871680e-04 12 7
557 plants 9.954357 1.604691e-03 13 9
547 relative 9.954357 1.604691e-03 13 9
309 border 9.792563 1.752191e-03 18 16
422 process 9.542069 2.008152e-03 15 12
991 physics 8.682874 3.212141e-03 9 5
947 spiritual 8.682874 3.212141e-03 9 5
439 string 8.057304 4.532057e-03 14 12
885 elements 7.198788 7.295282e-03 9 6
625 marine 6.779092 9.223181e-03 11 9
461 jersey 6.652781 9.900120e-03 13 12
721 greece 6.361380 1.166308e-02 10 8
829 translated 5.950459 1.471346e-02 9 7
848 window 5.950459 1.471346e-02 9 7
861 filled 5.950459 1.471346e-02 9 7
590 composed 5.772340 1.628035e-02 11 10
618 persian 5.772340 1.628035e-02 11 10
584 particles 5.772340 1.628035e-02 11 10
602 colony 5.772340 1.628035e-02 11 10
992 physicist 5.549240 1.848871e-02 8 6
694 freedom 5.333590 2.091826e-02 10 9
788 committee 4.896281 2.691459e-02 9 8
792 portuguese 4.896281 2.691459e-02 9 8
919 describe 4.460992 3.467736e-02 8 7
907 nature 4.460992 3.467736e-02 8 7
In [52]:
chi_squared_df.shape[0]
Out[52]:
36

Conclusion¶

In this project, a dataset of Jeopardy questions was used to look for patterns in the questions that could help a contestant win. After exploring the data, we found that:

  • On average, about 6% of the words in answers also appear in the corresponding questions, so the chance of deducing the answer from the question is quite low.
  • On average, about 69% of the complex words in a question have already appeared in earlier questions, so studying past questions can be really helpful.

Then we focused our study on high value questions instead of low value ones, since these are worth more money. Using a chi-squared test, we obtained a list of 36 words whose usage differs significantly between high value and low value questions and which appear more often in the high value ones.

A natural next step would be to find the high value questions that contain these words; those questions could then be recommended for study.
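As a rough sketch of that next step (assuming the jeopardy and chi_squared_df frames from above are still in memory), one could collect the high value questions that mention any of the 36 words:

# hypothetical follow-up: high value questions containing any of the significant words
significant_words = chi_squared_df['word'].tolist()
pattern = r"\b(?:" + "|".join(significant_words) + r")\b"
study_questions = jeopardy[
    (jeopardy['high_value'] == 1) &
    jeopardy['clean_question'].str.contains(pattern, regex=True)
]
print(study_questions[['Category', 'clean_question', 'clean_value']].head())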