Suppose $X^* \in\{0,1\}$ is a binary latent variable (researchers only know that $X^*$ is discrete; they have no knowledge of its support). $X$, $Y$, and $Z$ are observed variables that depend on $X^*$, where $X$ and $Z$ are continuous and $Y$ is binary. $X$, $Y$, and $Z$ are mutually independent conditional on $X^*$.
$\operatorname{Pr}\left(Y \mid X^*\right)$ can be consistently estimated using the methodology of Hu (2008), as the Monte Carlo experiments below demonstrate.
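Why this works, in brief (a sketch of the identification argument in Hu (2008), stated without the formal regularity conditions): once $X$ and $Z$ are discretized into $n_d$ bins, define the observed matrices $M_{Y,X,Z}=\left[\operatorname{Pr}(Y=1, X=i, Z=j)\right]_{ij}$ and $M_{X,Z}=\left[\operatorname{Pr}(X=i, Z=j)\right]_{ij}$. Conditional independence factors them as $$ \begin{aligned} M_{Y,X,Z} &= M_{X \mid X^*}\, D_{Y \mid X^*}\, D_{X^*}\, M_{Z \mid X^*}^{\top}, \\ M_{X,Z} &= M_{X \mid X^*}\, D_{X^*}\, M_{Z \mid X^*}^{\top}, \end{aligned} $$ where $M_{X \mid X^*}$ and $M_{Z \mid X^*}$ collect the conditional distributions in their columns, $D_{Y \mid X^*}=\operatorname{diag}\left(\operatorname{Pr}(Y=1 \mid X^*=k)\right)$, and $D_{X^*}=\operatorname{diag}\left(\operatorname{Pr}(X^*=k)\right)$. If $M_{X,Z}$ is invertible, the diagonal matrices commute and $$ M_{Y,X,Z} M_{X,Z}^{-1}=M_{X \mid X^*}\, D_{Y \mid X^*}\, M_{X \mid X^*}^{-1}, $$ so the eigenvalues of the observable matrix $M_{Y,X,Z} M_{X,Z}^{-1}$ are exactly $\operatorname{Pr}\left(Y=1 \mid X^*=k\right)$, and its eigenvectors, normalized to sum to one, are the columns of $M_{X \mid X^*}$.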
import numpy as np
from numpy.linalg import inv, eig, matrix_rank
from numpy.random import uniform as unif
from numpy.random import standard_normal as norm
Generate a sample $\left(X_i, Z_i, Y_i\right), i=1,2, \cdots, N$, using the DGP: $$ \begin{aligned} &\operatorname{Pr}\left(X^*=0\right)=0.4, \\ &\operatorname{Pr}\left(Y=1 \mid X^*=1\right)=0.7, \quad \operatorname{Pr}\left(Y=1 \mid X^*=0\right)=0.4, \\ &X=2 X^*+\eta, \\ &Z=X^*+\xi, \end{aligned} $$ where $\eta$ and $\xi$ are two independent standard normal random variables.
def gen_sample(n):
    # n: sample size
    # Pr(X*=1) = 0.6, Pr(X*=0) = 0.4
    Xstar = (unif(size=n) > 0.4) * 1.0
    # Pr(Y=1|X*=1) = 0.7; Pr(Y=1|X*=0) = 0.4
    Y = (unif(size=n) > 0.6 - Xstar * 0.3) * 1.0
    # X = 2 * X* + eta, eta ~ N(0, 1)
    X = 2 * Xstar + norm(size=n)
    # Z = X* + xi, xi ~ N(0, 1)
    Z = Xstar + norm(size=n)
    return Xstar, Y, X, Z
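As a quick sanity check on the DGP (not part of the estimation procedure), the empirical frequencies of a large draw should be close to the probabilities above:

Xstar, Y, X, Z = gen_sample(1_000_000)
print((Xstar == 0).mean())   # ~ 0.4 = Pr(X*=0)
print(Y[Xstar == 1].mean())  # ~ 0.7 = Pr(Y=1|X*=1)
print(Y[Xstar == 0].mean())  # ~ 0.4 = Pr(Y=1|X*=0)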
Discretize $X$ and $Z$.
def discretize(array, nd):
    # array: the array to discretize; nd: # of bins
    # equal-probability bins: cut points at the 100/nd, 200/nd, ... percentiles
    bins = [np.percentile(array, (100 / nd) * i) for i in range(1, nd)]
    return np.digitize(array, bins)
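For example, with nd = 2 the single cut point is the sample median, so each bin receives about half of the observations (a small illustrative check, reusing the norm alias imported above):

arr = norm(size=100_000)
print(np.bincount(discretize(arr, 2)) / 100_000)  # ~ [0.5, 0.5]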
Use the eigenvalue-eigenvector decomposition to estimate $\operatorname{Pr}\left(Y \mid X^*\right)$ and order the estimates correctly; the ordering relies on the assumption that $\operatorname{Pr}\left(Y=1 \mid X^*=1\right)>\operatorname{Pr}\left(Y=1 \mid X^*=0\right)$.
def estimate(Y, Xd, Zd, nd):
    # joint probability matrices
    M_YXZ = np.zeros((nd, nd))  # M_YXZ[i, j] = Pr(Y=1, X=i, Z=j)
    M_XZ = np.zeros((nd, nd))   # M_XZ[i, j] = Pr(X=i, Z=j)
    for i in range(nd):
        for j in range(nd):
            M_YXZ[i, j] = ((Y == 1) & (Xd == i) & (Zd == j)).mean()
            M_XZ[i, j] = ((Xd == i) & (Zd == j)).mean()
    # M_XZ must be invertible; rank(M_XZ) >= # of values of X* is
    # required for identification
    assert matrix_rank(M_XZ) == nd
    # eigendecomposition: eigenvalues are Pr(Y=1|X*),
    # eigenvectors (columns) are Pr(X|X*)
    Lambda, B = eig(M_YXZ @ inv(M_XZ))
    # eigenvalues are probabilities, hence real in theory; drop the
    # imaginary parts that finite-sample noise may introduce
    Lambda = Lambda.real
    # re-rank so that Pr(Y=1|X*=0) < Pr(Y=1|X*=1), and normalize each
    # eigenvector column to sum to one
    order = np.argsort(Lambda)
    B = (B.real / B.real.sum(0, keepdims=True))[:, order]
    Lambda = Lambda[order]
    return Lambda, B
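Putting the pieces together, a single simulation run might look as follows; the two eigenvalues should land near the true values 0.4 and 0.7, and the columns of B approximate $\operatorname{Pr}\left(X \mid X^*\right)$ for the discretized $X$:

Xstar, Y, X, Z = gen_sample(100_000)
Lambda, B = estimate(Y, discretize(X, 2), discretize(Z, 2), 2)
print(Lambda)  # ~ [0.4, 0.7]: Pr(Y=1|X*=0), Pr(Y=1|X*=1)
print(B)       # each column sums to one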
Test the algorithm's performance with different numbers of draws.
nd = 2        # number of bins
repeat = 100  # number of simulations
Y_Xstar = np.array([0.4, 0.7])  # truth: Pr(Y=1|X*=0), Pr(Y=1|X*=1)
for n_ in [1e3, 1e4, 1e5, 1e6]:
    n = int(n_)
    record = []
    for r in range(repeat):
        Xstar, Y, X, Z = gen_sample(n)
        Xd = discretize(X, nd)
        Zd = discretize(Z, nd)
        Lambda, _ = estimate(Y, Xd, Zd, nd)
        record.append(Lambda - Y_Xstar)
    print(f'# of draws: {n};', end='\t')
    std = np.array(record).std(axis=0)
    for i in range(nd):
        print(f'std Pr[Y=1|X*={i}]: {round(std[i], 8)};', end='\n' if i else '\t')
Recover the true values.
nd = 2        # number of bins
repeat = 200  # number of simulations
n = int(1e6)  # number of draws for each simulation
record = []
for r in range(repeat):
    Xstar, Y, X, Z = gen_sample(n)
    Xd = discretize(X, nd)
    Zd = discretize(Z, nd)
    Lambda, _ = estimate(Y, Xd, Zd, nd)
    record.append(Lambda)
results = np.array(record)
mean = results.mean(axis=0)
std = results.std(axis=0)
for i in range(nd):
    print(f'Pr[Y=1|X*={i}] = {round(mean[i], 8)}; std = {round(std[i], 8)}')