HW8:
1.1:
def one_resampled_percentage(tbl):
    """Return the percentage of 'Imm Thai' votes in one bootstrap resample of tbl."""
    resample = tbl.sample(with_replacement=True)
    n_imm = resample.where("Vote", "Imm Thai").num_rows
    return 100 * n_imm / resample.num_rows
one_resampled_percentage(votes)
1.2:
def percentages_in_resamples():
    """Return an array of 2023 bootstrapped 'Imm Thai' vote percentages."""
    results = make_array()
    for _ in np.arange(2023):
        results = np.append(results, one_resampled_percentage(votes))
    return results
1.3:
# 95% bootstrap CI: the middle 95% of the resampled percentages.
imm_lower_bound = np.percentile(resampled_percentages, 2.5)
imm_upper_bound = np.percentile(resampled_percentages, 97.5)
print(f"Bootstrapped 95% confidence interval for the percentage of Imm Thai "
      f"voters in the population: [{imm_lower_bound:.2f}, {imm_upper_bound:.2f}]")
1.4:
def one_resampled_difference(tbl):
    """Return Imm Thai's lead (percentage points) over all others in one resample."""
    resample = tbl.sample(with_replacement=True)
    pct_imm = 100 * resample.where("Vote", "Imm Thai").num_rows / resample.num_rows
    # Lead = Imm Thai's share minus everyone else's combined share.
    return pct_imm - (100 - pct_imm)
1.6:
# Middle 95% of the bootstrapped leads.
diff_lower_bound = np.percentile(sampled_leads, 2.5)
diff_upper_bound = np.percentile(sampled_leads, 97.5)
print("Bootstrapped 95% confidence interval for Imm Thai's true lead over "
      "Lucky House, Thai Temple, and Thai Basil combined: [{:f}%, {:f}%]"
      .format(diff_lower_bound, diff_upper_bound))
2.2:
true_percentage_intervals = 6000 * 0.95
HW9:
# Question 2.1
# Numeric free-response answer, checked by the autograder call below.
smallest = 1112
smallest
# Question 2.2
grader.check("q2_1")
# Question 2.3
# Numeric free-response answer for q2.3.
smallest_num = 767
smallest_num
# Question 2.4
grader.check("q2_3")
# Question 2.5
# Multiple-choice answer for q2.5 (option number).
option = 1
option
# Question 2.6
grader.check("q2_5")
# Question 3.1
# Build the 400-voter sample table, then attach each vote's share of the total.
sample = Table().with_columns(
    "Vote", make_array("Yes", "No"),
    "Count", make_array(210, 190))
sample_size = sum(sample.column("Count"))
sample_with_proportions = sample.with_column(
    "Proportion", sample.column("Count") / sample_size)
sample_with_proportions
# Question 3.2
# Simulate 10,000 resamples of 400 voters from a 52.5% "Yes" population,
# recording the "Yes" proportion of each resample.
resample_yes_proportions = make_array()
for _ in np.arange(10000):
    one_resample = sample_proportions(400, [0.525, 0.475])
    resample_yes_proportions = np.append(resample_yes_proportions,
                                         one_resample.item(0))
Table().with_column(
    "Resample Yes proportion", resample_yes_proportions
).hist(bins=np.arange(.2, .8, .01))
resample_yes_proportions
# Question 3.3
grader.check("q3_1")
# Question 3.4
# SD of a 0/1 population with p = 0.525, then the SD of the sample
# proportion for samples of size 400 (population SD / sqrt(sample size)).
approx_pop_sd = (0.525 * 0.475) ** 0.5
approximate_sd = approx_pop_sd / (400 ** 0.5)
approximate_sd
# Question 3.5
grader.check("q3_3")
# Question 3.6
# Empirical SD of the simulated sample proportions; should be close to
# the formula-based approximate_sd computed above.
exact_sd = np.std(resample_yes_proportions)
exact_sd
# Question 3.7
grader.check("q3_4")
# Question 3.8
# Normal-approximation 95% confidence interval: mean +/- 1.96 SDs.
center = np.mean(resample_yes_proportions)
lower_limit = center - 1.96 * approximate_sd
upper_limit = center + 1.96 * approximate_sd
print('lower:', lower_limit, 'upper:', upper_limit)
# Question 3.9
grader.check("q3_5")
# Question 3.10
# Plug Ella's sample size into the SD-of-the-sample-mean formula:
# population SD / sqrt(sample size).
estimated_population_sd = (0.525 * 0.475) ** 0.5
ella_sample_size = 9975
ella_sample_mean_sd = estimated_population_sd / ella_sample_size ** 0.5
print("With Ella's sample size, you would predict a sample mean SD of %f." % ella_sample_mean_sd)
# Question 3.11
grader.check("q3_6")
# Question 3.12
# Same formula with a smaller sample: the sample-mean SD grows.
smaller_sample_size = 4000
smaller_sample_mean_sd = estimated_population_sd / np.sqrt(smaller_sample_size)
print("With this smaller sample size, you would predict a sample mean SD of %f" % smaller_sample_mean_sd)
# Question 3.13
grader.check("q3_7")
# Question 3.14
# Same formula with a larger sample: the sample-mean SD shrinks.
larger_sample_size = 11000
larger_sample_mean_sd = estimated_population_sd / np.sqrt(larger_sample_size)
print("With this larger sample size, you would predict a sample mean SD of %f" % larger_sample_mean_sd)
# Question 3.15
grader.check("q3_8")
# Question 3.16
# True/False answer: whether the minimum sample size is sufficient
# (checked by the autograder below).
min_sufficient = False
min_sufficient
# Question 3.17
grader.check("q3_9")
# Final Submission
# Save your notebook first, then run this cell to export your submission.
grader.export(pdf=False, run_tests=True)
HW10:
1.1:
def standard_units(data):
    """Convert data to standard units: (value - mean) / SD."""
    mu = np.mean(data)
    sd = np.std(data)
    return (data - mu) / sd
1.2:
standard_array = make_array(2,3,4,5)
1.3:
def correlation(x, y):
    """Computes the correlation between two arrays.

    Pearson r: the mean of the products of the two arrays in standard units.
    """
    x_su = (x - np.mean(x)) / np.std(x)
    y_su = (y - np.mean(y)) / np.std(y)
    return np.mean(x_su * y_su)
1.4:
r_array = make_array(1, 2, 3, 4)
1.5:
def slope(x, y):
    """Slope of the least-squares regression line of y on x, in original units."""
    # r = mean of products of standard units; slope = r * SD(y) / SD(x).
    x_su = (x - np.mean(x)) / np.std(x)
    y_su = (y - np.mean(y)) / np.std(y)
    r = np.mean(x_su * y_su)
    return r * np.std(y) / np.std(x)
1.6:
slope_array = make_array(2, 4, 5)
1.7:
def intercept(x, y):
    """Computes the intercept of the regression line."""
    # The least-squares line passes through the point of averages.
    y_bar = np.mean(y)
    x_bar = np.mean(x)
    return y_bar - x_bar * slope(x, y)
1.8:
intercept_array = make_array(1, 4)
1.9:
def predict(tbl, col1, col2):
    """Fitted values of the regression of col2 on col1, one per row of tbl."""
    xs = tbl.column(col1)
    ys = tbl.column(col2)
    m = slope(xs, ys)
    b = intercept(xs, ys)
    return m * xs + b
2.2:
r_guess = -0.75
2.7:
def rmse(slope, intercept):
    """Root-mean-squared error of the line slope*x + intercept.

    NOTE(review): reads the module-level arrays `ages` (x) and `values` (y)
    defined elsewhere in the notebook.
    """
    residuals = (slope * ages + intercept) - values
    return np.sqrt(np.mean(residuals ** 2))
2.10:
# Answer for 2.10 — presumably the indices of the chosen statements;
# verify against the notebook prompt.
error_array = make_array(2, 4)
2.11:
# Answer for 2.11.
scoring_array = make_array(3)
HW11:
# Question 0.3
# Free-response answer for q0.3, checked by the autograder below.
secret_word = 'abc'
# Question 0.4
grader.check("q0_1")
# Question 1.1
# Load the snowy plover data set from CSV and display it.
birds = Table.read_table('snowy_plover.csv')
birds
# Question 1.2
# Just run this cell and examine the scatter plot.
birds.scatter('Egg Weight', "Bird Weight", fit_line=True)
# Question 1.3
def standard_units(arr):
    """Convert arr to standard units (mean 0, SD 1)."""
    mu, sd = np.mean(arr), np.std(arr)
    return (arr - mu) / sd
def correlation(tbl, x_col, y_col):
    """Pearson correlation between two columns of tbl."""
    x_su = standard_units(tbl.column(x_col))
    y_su = standard_units(tbl.column(y_col))
    return np.mean(x_su * y_su)
# Question 1.4
def fit_line(tbl, x_col, y_col):
    """Return make_array(slope, intercept) of the regression of y_col on x_col."""
    r = correlation(tbl, x_col, y_col)
    m = r * np.std(tbl.column(y_col)) / np.std(tbl.column(x_col))
    b = np.mean(tbl.column(y_col)) - m * np.mean(tbl.column(x_col))
    return make_array(m, b)
fit_line(birds, "Egg Weight", "Bird Weight")
# Question 1.5
# Bootstrap the slope: resample the birds table 1000 times and record the
# regression slope of each resample.
resampled_slopes = make_array()
for i in np.arange(1000):
    birds_bootstrap = birds.sample(with_replacement=True)
    # BUG FIX: fit the line to the bootstrap resample, not the original
    # `birds` table — the original code ignored `birds_bootstrap`, so every
    # iteration produced the identical slope and the bootstrap distribution
    # had zero spread (making the confidence interval a single point).
    bootstrap_line = fit_line(birds_bootstrap, 'Egg Weight', 'Bird Weight')
    bootstrap_slope = bootstrap_line.item(0)
    resampled_slopes = np.append(resampled_slopes, bootstrap_slope)
# Question 1.6
# Middle 95% of the bootstrapped slopes.
lower_end = np.percentile(resampled_slopes, 2.5)
upper_end = np.percentile(resampled_slopes, 97.5)
print("95% confidence interval for slope: [{:g}, {:g}]".format(lower_end, upper_end))
# Question 2.1
def fitted_value(table, x_col, y_col, given_x):
    """Predict y at given_x using the regression line fitted on table."""
    m, b = fit_line(table, x_col, y_col)
    return m * given_x + b
egg_weight_eight = fitted_value(birds, "Egg Weight", "Bird Weight", 8)
egg_weight_eight
grader.check("q2_1")
# Question 2.2
# Prediction at an egg weight of 9 grams, checked by the autograder below.
experts_egg = fitted_value(birds, 'Egg Weight', 'Bird Weight', 9)
experts_egg
grader.check("q2_2")
# Question 2.3
def compute_resampled_line(tbl, x_col, y_col):
    """Fit a regression line to one bootstrap resample of tbl."""
    one_resample = tbl.sample(with_replacement=True)
    return fit_line(one_resample, x_col, y_col)
grader.check("q2_3")
# Question 2.4
# One prediction at x = 8 from each bootstrapped regression line.
predictions_for_eight = (regression_lines.column('Slope') * 8
                         + regression_lines.column('Intercept'))
# This will make a histogram of your predictions:
table_of_predictions = Table().with_column(
    'Predictions at Egg Weight=8', predictions_for_eight)
table_of_predictions.hist('Predictions at Egg Weight=8', bins=20)
grader.check("q2_4")
# Question 2.5
# Middle 95% of the bootstrapped predictions at x = 8.
lower_bound = np.percentile(predictions_for_eight, 2.5)
upper_bound = np.percentile(predictions_for_eight, 97.5)
print('95% Confidence interval for predictions for x=8: (', lower_bound, ",", upper_bound, ')')
grader.check("q2_5")
# Question 2.6
# Answer for q2.6 — presumably the indices of the true statements;
# verify against the notebook prompt.
plover_statements = make_array(1)
grader.check("q2_6")
HW12:
1.1.1:
def distance(arr1, arr2):
    """Euclidean distance between two equal-length numeric arrays."""
    squared_diffs = (arr1 - arr2) ** 2
    return np.sum(squared_diffs) ** 0.5
1.2:
# Shuffle the table, then split it 75/25 into training and test sets.
shuffled_table = coordinates.sample(with_replacement=False)
train = shuffled_table.take(np.arange(0, 75))
test = shuffled_table.take(np.arange(75, 100))
print("Training set:\t", train.num_rows, "examples")
print("Test set:\t", test.num_rows, "examples")
train.show(5), test.show(5);
1.3:
# Column names used as the k-NN feature set.
features = make_array("longitude", "latitude")
features
1.4:
def row_to_array(row, features):
    """Collect the given feature values of a table row into a float array."""
    return np.array([row.item(f) for f in features], dtype=float)
def classify(test_row, k, train):
    """Predict the 'school' label of test_row by majority vote of its
    k nearest training rows.

    NOTE(review): reads the module-level `features` array to know which
    columns to compare.
    """
    target = row_to_array(test_row, features)
    dists = make_array()
    for train_row in train.rows:
        candidate = row_to_array(train_row, features)
        dists = np.append(dists, distance(target, candidate))
    ranked = train.with_column("distance", dists).sort("distance")
    nearest_neighbors = ranked.take(np.arange(k))
    label_counts = nearest_neighbors.group('school').sort('count', descending=True)
    return label_counts.column('school').item(0)
# Don't modify/delete the code below
first_test = classify(test.row(0), 5, train)
first_test
1.5:
def three_classify(row):
    """Classify one row using its 3 nearest neighbors in the training set."""
    return classify(row, 3, train)
# Predict every test row, then measure the fraction predicted correctly.
test_with_prediction = test.with_column(
    "prediction", test.apply(three_classify))
labels_correct = np.count_nonzero(
    test_with_prediction.column("school")
    == test_with_prediction.column("prediction"))
accuracy = labels_correct / test.num_rows
accuracy
1.9.1:
# P(test) via the law of total probability over the two schools,
# then P(Stanford | test) via Bayes' rule.
prob_test_given_stanford = 0.5
prob_stanford = 23 / 100
prob_test_given_berkeley = 0.2
prob_berkeley = 77 / 100
prob_test = (prob_stanford * prob_test_given_stanford
             + prob_berkeley * prob_test_given_berkeley)
prob_furd = prob_stanford * prob_test_given_stanford / prob_test
1.9.2:
prob_test = 0.5