# Table II and Table X of the online appendix of https://doi.org/10.1093/qje/qjaa005 import math import numpy as np import scipy.stats ########## DATA ########## table10a = np.array([ [0.277, 0.278, 0.198, 0.139, 0.108], # 400 - 490 [0.252, 0.263, 0.200, 0.168, 0.118], # 500 - 590 [0.207, 0.242, 0.209, 0.195, 0.148], # 600 - 690 [0.168, 0.211, 0.213, 0.223, 0.186], # 700 - 790 [0.122, 0.172, 0.211, 0.252, 0.242], # 800 - 890 [0.088, 0.141, 0.202, 0.269, 0.300], # 900 - 990 [0.066, 0.116, 0.188, 0.272, 0.358], # 1000 - 1090 [0.050, 0.099, 0.173, 0.267, 0.411], # 1100 - 1190 [0.040, 0.082, 0.153, 0.251, 0.474], # 1200 - 1290 [0.033, 0.069, 0.135, 0.231, 0.532], # 1300 - 1390 [0.028, 0.059, 0.118, 0.209, 0.586], # 1400 - 1490 [0.022, 0.044, 0.084, 0.168, 0.681] # 1500 - 1600 ]) for row in table10a: assert sum(row)-1 < 0.002 table10b = np.array([0.001, 0.007, 0.022, 0.060, 0.118, 0.182, 0.203, 0.184, 0.121, 0.065, 0.029, 0.008]) for i in range(len(table10a)): table10a[i] = table10a[i] * table10b[i] ########## FUNCTIONS ########## def bucketFromSAT(sat): if sat < 495: return 0 elif sat < 595: return 1 elif sat < 695: return 2 elif sat < 795: return 3 elif sat < 895: return 4 elif sat < 995: return 5 elif sat < 1095: return 6 elif sat < 1195: return 7 elif sat < 1295: return 8 elif sat < 1395: return 9 elif sat < 1495: return 10 else: return 11 # thresholds chosen to match table10a def bucketFromIncomePercentile(percentile): if percentile < 0.07765799999999998: return 0 elif percentile < 0.201953: return 1 elif percentile < 0.383969: return 2 elif percentile < 0.638783: return 3 else: return 4 ########## SIMULATION ########## N = 100000 normalizedSAT = np.random.normal(0, 1, N) noise = np.random.normal(0, 1, N) for it in range(20): r = 0.30 + (it - 10) / 100.0 normalizedIncome = r * normalizedSAT + math.sqrt(1 - r*r) * noise # parental income # I chose the mean and sd to minimize square error of satBucket and table10b # It's not that far off from the true distribution: https://nces.ed.gov/programs/digest/d20/tables/dt20_226.40.asp sat = np.clip(1047 + 192 * normalizedSAT, 200, 1600) incomePercentile = np.array([scipy.stats.norm.cdf(x) for x in normalizedIncome]) satBucket = np.array([bucketFromSAT(s) for s in sat]) incomeBucket = np.array([bucketFromIncomePercentile(i) for i in incomePercentile]) counts = np.zeros(table10a.shape) for i in range(len(satBucket)): counts[satBucket[i]][incomeBucket[i]] += 1 counts = counts / len(satBucket) squareError = sum(sum(np.square(counts - table10a))) print(r, squareError) # r=0.3 is best