# Table II and Table X of the online appendix of https://doi.org/10.1093/qje/qjaa005

import math
import numpy as np
import scipy.stats

########## DATA ##########

table10a = np.array([
  [0.277, 0.278, 0.198, 0.139, 0.108], #  400 -  490
  [0.252, 0.263, 0.200, 0.168, 0.118], #  500 -  590
  [0.207, 0.242, 0.209, 0.195, 0.148], #  600 -  690
  [0.168, 0.211, 0.213, 0.223, 0.186], #  700 -  790
  [0.122, 0.172, 0.211, 0.252, 0.242], #  800 -  890
  [0.088, 0.141, 0.202, 0.269, 0.300], #  900 -  990
  [0.066, 0.116, 0.188, 0.272, 0.358], # 1000 - 1090
  [0.050, 0.099, 0.173, 0.267, 0.411], # 1100 - 1190
  [0.040, 0.082, 0.153, 0.251, 0.474], # 1200 - 1290
  [0.033, 0.069, 0.135, 0.231, 0.532], # 1300 - 1390
  [0.028, 0.059, 0.118, 0.209, 0.586], # 1400 - 1490
  [0.022, 0.044, 0.084, 0.168, 0.681]  # 1500 - 1600
])

for row in table10a:
  assert sum(row)-1 < 0.002

table10b = np.array([0.001, 0.007, 0.022, 0.060, 0.118, 0.182, 0.203, 0.184, 0.121, 0.065, 0.029, 0.008])

for i in range(len(table10a)):
  table10a[i] = table10a[i] * table10b[i]

########## FUNCTIONS ##########

def bucketFromSAT(sat):
  if sat < 495:
    return 0
  elif sat < 595:
    return 1
  elif sat < 695:
    return 2
  elif sat < 795:
    return 3
  elif sat < 895:
    return 4
  elif sat < 995:
    return 5
  elif sat < 1095:
    return 6
  elif sat < 1195:
    return 7
  elif sat < 1295:
    return 8
  elif sat < 1395:
    return 9
  elif sat < 1495:
    return 10
  else:
    return 11

# thresholds chosen to match table10a
def bucketFromIncomePercentile(percentile):
  if percentile < 0.07765799999999998:
    return 0
  elif percentile < 0.201953:
    return 1
  elif percentile < 0.383969:
    return 2
  elif percentile < 0.638783:
    return 3
  else:
    return 4

########## SIMULATION ##########

N = 100000

normalizedSAT = np.random.normal(0, 1, N)
noise = np.random.normal(0, 1, N)

for it in range(20):
  r = 0.30 + (it - 10) / 100.0
  normalizedIncome = r * normalizedSAT + math.sqrt(1 - r*r) * noise # parental income

  # I chose the mean and sd to minimize square error of satBucket and table10b
  # It's not that far off from the true distribution: https://nces.ed.gov/programs/digest/d20/tables/dt20_226.40.asp
  sat = np.clip(1047 + 192 * normalizedSAT, 200, 1600)
  incomePercentile = np.array([scipy.stats.norm.cdf(x) for x in normalizedIncome])
  satBucket = np.array([bucketFromSAT(s) for s in sat])
  incomeBucket = np.array([bucketFromIncomePercentile(i) for i in incomePercentile])
  counts = np.zeros(table10a.shape)
  for i in range(len(satBucket)):
    counts[satBucket[i]][incomeBucket[i]] += 1
  counts = counts / len(satBucket)
  squareError = sum(sum(np.square(counts - table10a)))
  print(r, squareError)

# r=0.3 is best