PCA From Scratch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-talk')
from sklearn.datasets import load_iris
from scipy import linalg
In [2]:
iris = load_iris()
iris.data[:5]
Out[2]:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])
In [3]:
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(iris.data)
X_scaled[:5]
Out[3]:
array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])
In [6]:
from sklearn.decomposition import PCA

X_pca2 = PCA(n_components=2).fit_transform(X_scaled)
fig, ax = plt.subplots(figsize=(12, 9))
ax.scatter(X_pca2[:, 0], X_pca2[:, 1], c=iris.target)
X_pca2[:5]
Out[6]:
array([[-2.26470281,  0.4800266 ],
       [-2.08096115, -0.67413356],
       [-2.36422905, -0.34190802],
       [-2.29938422, -0.59739451],
       [-2.38984217,  0.64683538]])
In [ ]:
features = X_scaled.T
cov_matrix = np.cov(features)
cov_matrix[:5]
In [ ]:
values, vectors = np.linalg.eig(cov_matrix)
values[:5]
In [ ]:
explained_variances = []
for i in range(len(values)):
    explained_variances.append(values[i] / np.sum(values))

print(np.sum(explained_variances), \n, explained_variances)
In [ ]:
projected_1 = X_scaled.dot(vectors.T[0])
projected_2 = X_scaled.dot(vectors.T[1])res = pd.DataFrame(projected_1, columns=[PC1])
res[PC2] = projected_2
res[Y] = y
res.head()
In [ ]:
import matplotlib.pyplot as plt
import seaborn as snsplt.figure(figsize=(20, 10))
sns.scatterplot(res[PC1], [0] * len(res), hue=res[Y], s=200)
In [ ]:
plt.figure(figsize=(20, 10))
sns.scatterplot(res[PC1], [0] * len(res), hue=res[Y], s=100)
In [50]:
pca = PCA()
pca.fit_transform(iris.data)
pca.components_, pca.explained_variance_ratio_
Out[50]:
(array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
        [ 0.65658877,  0.73016143, -0.17337266, -0.07548102],
        [-0.58202985,  0.59791083,  0.07623608,  0.54583143],
        [-0.31548719,  0.3197231 ,  0.47983899, -0.75365743]]),
 array([0.92461872, 0.05306648, 0.01710261, 0.00521218]))
In [15]:
M = iris.data.T@iris.data
M
Out[15]:
array([[5223.85, 2673.43, 3483.76, 1128.14],
       [2673.43, 1430.4 , 1674.3 ,  531.89],
       [3483.76, 1674.3 , 2582.71,  869.11],
       [1128.14,  531.89,  869.11,  302.33]])
In [59]:
S, V, D = linalg.svd(M, full_matrices=False)
S, V, D
#S.shape, V.shape, D.shape
#S@np.eye(len(V))*V@D
Out[59]:
(array([[-0.75110816, -0.2841749 ,  0.50215472,  0.32081425],
        [-0.38008617, -0.5467445 , -0.67524332, -0.31725607],
        [-0.51300886,  0.70866455, -0.05916621, -0.48074507],
        [-0.16790754,  0.34367081, -0.53701625,  0.75187165]]),
 array([9.20830507e+03, 3.15454317e+02, 1.19780429e+01, 3.55257020e+00]),
 array([[-0.75110816, -0.38008617, -0.51300886, -0.16790754],
        [-0.2841749 , -0.5467445 ,  0.70866455,  0.34367081],
        [ 0.50215472, -0.67524332, -0.05916621, -0.53701625],
        [ 0.32081425, -0.31725607, -0.48074507,  0.75187165]]))
In [16]:
P, L, U = linalg.lu(M)
print(P)
print(L)
print(U)
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]
[[ 1.          0.          0.          0.        ]
 [ 0.66689511  1.          0.          0.        ]
 [ 0.51177388 -0.57283468  1.          0.        ]
 [ 0.21595949  0.41863429  0.20405077  1.        ]]
[[5223.85       2673.43       3483.76       1128.14      ]
 [   0.         -108.5973835   259.40750517  116.758955  ]
 [   0.            0.           40.00023144   21.42098986]
 [   0.            0.            0.            5.44718522]]
In [17]:
L@U
Out[17]:
array([[5223.85, 2673.43, 3483.76, 1128.14],
       [3483.76, 1674.3 , 2582.71,  869.11],
       [2673.43, 1430.4 , 1674.3 ,  531.89],
       [1128.14,  531.89,  869.11,  302.33]])
In [32]:
D = np.diag(U)
U_ = U / D[:, np.newaxis]
D = np.eye(len(D))*D
L@D@U_
Out[32]:
array([[5223.85, 2673.43, 3483.76, 1128.14],
       [3483.76, 1674.3 , 2582.71,  869.11],
       [2673.43, 1430.4 , 1674.3 ,  531.89],
       [1128.14,  531.89,  869.11,  302.33]])
In [40]:
L_norm = np.sqrt((L**2).sum(axis=1))[:, np.newaxis]
L_norm
Out[40]:
array([[1.        ],
       [1.20197716],
       [1.26097267],
       [1.12406845]])
In [76]:
X = iris.data - iris.data.mean(axis=0)
(X.T@X) / np.cov(X.T)
Out[76]:
array([[149., 149., 149., 149.],
       [149., 149., 149., 149.],
       [149., 149., 149., 149.],
       [149., 149., 149., 149.]])
In [78]:
linalg.eig(np.cov(X.T))
Out[78]:
(array([4.22824171+0.j, 0.24267075+0.j, 0.0782095 +0.j, 0.02383509+0.j]),
 array([[ 0.36138659, -0.65658877, -0.58202985,  0.31548719],
        [-0.08452251, -0.73016143,  0.59791083, -0.3197231 ],
        [ 0.85667061,  0.17337266,  0.07623608, -0.47983899],
        [ 0.3582892 ,  0.07548102,  0.54583143,  0.75365743]]))
In [80]:
val, vec = linalg.eig(X.T@X)
val, vec
Out[80]:
(array([630.0080142 +0.j,  36.15794144+0.j,  11.65321551+0.j,
          3.55142885+0.j]),
 array([[ 0.36138659, -0.65658877, -0.58202985,  0.31548719],
        [-0.08452251, -0.73016143,  0.59791083, -0.3197231 ],
        [ 0.85667061,  0.17337266,  0.07623608, -0.47983899],
        [ 0.3582892 ,  0.07548102,  0.54583143,  0.75365743]]))
In [83]:
val / val.sum()
Out[83]:
array([0.92461872+0.j, 0.05306648+0.j, 0.01710261+0.j, 0.00521218+0.j])
In [85]:
A = X.T@X
A
Out[85]:
array([[102.16833333,  -6.32266667, 189.873     ,  76.92433333],
       [ -6.32266667,  28.30693333, -49.1188    , -18.12426667],
       [189.873     , -49.1188    , 464.3254    , 193.0458    ],
       [ 76.92433333, -18.12426667, 193.0458    ,  86.56993333]])
In [88]:
Q, R = np.linalg.qr(A)
Q@R
Out[88]:
array([[102.16833333,  -6.32266667, 189.873     ,  76.92433333],
       [ -6.32266667,  28.30693333, -49.1188    , -18.12426667],
       [189.873     , -49.1188    , 464.3254    , 193.0458    ],
       [ 76.92433333, -18.12426667, 193.0458    ,  86.56993333]])
In [98]:
B.sum(axis=0)
Out[98]:
array([630.0080142 ,  36.15794144,  11.6532155 ,   3.55142885])
In [113]:
#np.sqrt((B**2).sum(axis=0))
C = B.copy()
C[abs(C) < 0.00001] = 0
np.diag(C)
Out[113]:
array([630.0080142 ,  36.15794144,  11.65321551,   3.55142885])
In [126]:
mu = np.diag(C)[0]
mu
Out[126]:
630.0080141991947
In [127]:
b = np.random.rand(4)
for i in range(20):
    part = np.linalg.inv(A - np.eye(len(b))*mu)@b
    b = part/np.sqrt((part**2).sum())
    #if i % 10 == 0:
    #    print(b)
    print(b)
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
[-0.36138659  0.08452251 -0.85667061 -0.3582892 ]
[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
In [112]:
B / Q@R
Out[112]:
array([[ 3.96910098e+05,  3.57611171e+11,  7.07761597e+19,
        -8.20126460e+29],
       [ 2.27797929e+04,  1.30739673e+03, -1.35796775e+02,
         1.78869636e+06],
       [ 7.34161916e+03,  4.21356284e+02,  1.35797432e+02,
         1.26125756e+01],
       [-2.23742864e+03, -1.28412357e+02, -4.13855658e+01,
         1.26126469e+01]])
In [90]:
B = A.copy()
for i in range(20):
    print(B)
    Q, R = np.linalg.qr(B)
    B = R@Q
[[102.16833333  -6.32266667 189.873       76.92433333]
 [ -6.32266667  28.30693333 -49.1188     -18.12426667]
 [189.873      -49.1188     464.3254     193.0458    ]
 [ 76.92433333 -18.12426667 193.0458      86.56993333]]
[[ 6.23068798e+02 -5.71584659e+01  2.85832436e+01 -1.54553883e+00]
 [-5.71584659e+01  4.10188441e+01 -6.90732653e+00  1.69802749e-01]
 [ 2.85832436e+01 -6.90732653e+00  1.33479057e+01 -1.80820184e+00]
 [-1.54553883e+00  1.69802749e-01 -1.80820184e+00  3.93505189e+00]]
[[ 6.29986558e+02 -3.53009019e+00  5.33430043e-01  8.95454325e-03]
 [-3.53009019e+00  3.61067152e+01 -1.33215501e+00 -1.97113521e-03]
 [ 5.33430043e-01 -1.33215501e+00  1.16893741e+01  5.43436751e-01]
 [ 8.95454325e-03 -1.97113521e-03  5.43436751e-01  3.58795302e+00]]
[[ 6.30007944e+02 -2.03830773e-01  9.86992566e-03 -5.05821857e-05]
 [-2.03830773e-01  3.61505135e+01 -4.28601978e-01  1.89024943e-04]
 [ 9.86992566e-03 -4.28601978e-01  1.16573099e+01 -1.66046605e-01]
 [-5.05821857e-05  1.89024943e-04 -1.66046605e-01  3.55483253e+00]]
[[ 6.30008014e+02 -1.17057013e-02  1.82570992e-04  2.85192079e-07]
 [-1.17057013e-02  3.61571629e+01 -1.38144212e-01 -1.85703916e-05]
 [ 1.82570992e-04 -1.38144212e-01  1.16536781e+01  5.06157379e-02]
 [ 2.85192058e-07 -1.85703916e-05  5.06157379e-02  3.55174508e+00]]
[[ 6.30008014e+02 -6.71866668e-04  3.37702145e-06 -1.60771102e-09]
 [-6.71866668e-04  3.61578605e+01 -4.45225239e-02  1.82403944e-06]
 [ 3.37702146e-06 -4.45225239e-02  1.16532670e+01 -1.54259280e-02]
 [-1.60768944e-09  1.82403945e-06 -1.54259280e-02  3.55145822e+00]]
[[ 6.30008014e+02 -3.85605864e-05  6.24645645e-08  9.08434478e-12]
 [-3.85605865e-05  3.61579330e+01 -1.43490272e-02 -1.79157498e-07]
 [ 6.24645780e-08 -1.43490272e-02  1.16532212e+01  4.70120667e-03]
 [ 9.06274830e-12 -1.79157504e-07  4.70120667e-03  3.55143158e+00]]
[[ 6.30008014e+02 -2.21310260e-06  1.15538965e-09 -7.26897122e-14]
 [-2.21310264e-06  3.61579406e+01 -4.62449814e-03  1.75968287e-08]
 [ 1.15540315e-09 -4.62449814e-03  1.16532161e+01 -1.43273787e-03]
 [-5.10877798e-14  1.75968349e-08 -1.43273787e-03  3.55142911e+00]]
[[ 6.30008014e+02 -1.27016183e-07  2.13579209e-11  2.18915793e-14]
 [-1.27016228e-07  3.61579414e+01 -1.49041323e-03 -1.72835296e-09]
 [ 2.13714139e-11 -1.49041323e-03  1.16532156e+01  4.36640571e-04]
 [ 2.87987793e-16 -1.72835915e-09  4.36640571e-04  3.55142888e+00]]
[[ 6.30008014e+02 -7.28977616e-09  3.81815233e-13 -2.16057204e-14]
 [-7.28982054e-09  3.61579414e+01 -4.80340028e-04  1.69753051e-10]
 [ 3.95305594e-13 -4.80340028e-04  1.16532155e+01 -1.33070389e-04]
 [-1.62342087e-18  1.69759238e-10 -1.33070389e-04  3.55142886e+00]]
[[ 6.30008014e+02 -4.18339038e-10 -6.17758395e-15  2.16042602e-14]
 [-4.18383415e-10  3.61579414e+01 -1.54807095e-04 -1.66675461e-11]
 [ 7.31194076e-15 -1.54807095e-04  1.16532155e+01  4.05544735e-05]
 [ 9.15141331e-21 -1.66737328e-11  4.05544736e-05  3.55142885e+00]]
[[ 6.30008014e+02 -2.39678307e-11 -1.33540111e-14 -2.16042980e-14]
 [-2.40122073e-11  3.61579414e+01 -4.98922330e-05  1.63150547e-12]
 [ 1.35248472e-16 -4.98922330e-05  1.16532155e+01 -1.23593636e-05]
 [-5.15875871e-23  1.63769212e-12 -1.23593636e-05  3.55142885e+00]]
[[ 6.30008014e+02 -1.33375185e-12 -1.34866737e-14  2.16043123e-14]
 [-1.37812848e-12  3.61579414e+01 -1.60795919e-05 -1.54667285e-13]
 [ 2.50168182e-18 -1.60795919e-05  1.16532155e+01  3.76663423e-06]
 [ 2.90805261e-25 -1.60853932e-13  3.76663424e-06  3.55142885e+00]]
[[ 6.30008014e+02 -3.47180573e-14 -1.34891024e-14 -2.16043166e-14]
 [-7.90946902e-14  3.61579414e+01 -5.18223500e-06  9.61241000e-15]
 [ 4.62734389e-20 -5.18223500e-06  1.16532155e+01 -1.14791780e-06]
 [-1.63930327e-27  1.57990547e-14 -1.14791780e-06  3.55142885e+00]]
[[ 6.30008014e+02  3.98371669e-14 -1.34891393e-14  2.16043179e-14]
 [-4.53946793e-15  3.61579414e+01 -1.67016425e-06  4.63486271e-15]
 [ 8.55916661e-22 -1.67016425e-06  1.16532155e+01  3.49838924e-07]
 [ 9.24094422e-30 -1.55178134e-15  3.49838926e-07  3.55142885e+00]]
[[ 6.30008014e+02  4.41161026e-14 -1.34891375e-14 -2.16043183e-14]
 [-2.60532901e-16  3.61579414e+01 -5.38271342e-07 -6.03422806e-15]
 [ 1.58318324e-23 -5.38271351e-07  1.16532155e+01 -1.06616756e-07]
 [-5.20922833e-32  1.52415785e-16 -1.06616758e-07  3.55142885e+00]]
[[ 6.30008014e+02  4.43616829e-14 -1.34891366e-14  2.16043185e-14]
 [-1.49527199e-17  3.61579414e+01 -1.73477568e-07  6.17167353e-15]
 [ 2.92840331e-25 -1.73477577e-07  1.16532155e+01  3.24924748e-08]
 [ 2.93650293e-34 -1.49702609e-17  3.24924764e-08  3.55142885e+00]]
[[ 6.30008014e+02  4.43757775e-14 -1.34891363e-14 -2.16043185e-14]
 [-8.58178878e-19  3.61579414e+01 -5.59094677e-08 -6.18517339e-15]
 [ 5.41664773e-27 -5.59094767e-08  1.16532155e+01 -9.90239129e-09]
 [-1.65534104e-36  1.47037730e-18 -9.90239287e-09  3.55142885e+00]]
[[ 6.30008014e+02  4.43765865e-14 -1.34891363e-14  2.16043185e-14]
 [-4.92533125e-20  3.61579414e+01 -1.80188591e-08  6.18649934e-15]
 [ 1.00191366e-28 -1.80188682e-08  1.16532155e+01  3.01784733e-09]
 [ 9.33135104e-39 -1.44420290e-19  3.01784891e-09  3.55142885e+00]]
[[ 6.30008014e+02  4.43766329e-14 -1.34891362e-14 -2.16043185e-14]
 [-2.82678688e-21  3.61579414e+01 -5.80722847e-09 -6.18662958e-15]
 [ 1.85323290e-30 -5.80723752e-09  1.16532155e+01 -9.19716729e-10]
 [-5.26019171e-41  1.41849442e-20 -9.19718313e-10  3.55142885e+00]]