原生的C++矩阵类封装,基本使用:
#include <iostream>
#include <math.h>
#include <complex>
#include "Numcpp.hpp"
using namespace np;
// 复数,推荐使用C++的复数类型,支持FFT变换
typedef std::complex<double> complex_double;
#define nc_t complex_double
nc_t sinxy(nc_t x, nc_t y)
{
    // Apply sin independently to the real and imaginary parts of x.
    // The second argument is unused (the <func> operator always passes two).
    const double re = sin(x.real());
    const double im = sin(x.imag());
    return nc_t(re, im);
}
// Multiply every element of nc by a sample from N(1, 3^2), generated with the
// Box-Muller transform.
//
// Fixes over the original:
//  * U1 is mapped into (0, 1] instead of [0, 1]; rand() can return 0, and
//    log(0) = -inf would poison the whole matrix.
//  * The PRNG is seeded only once. Re-seeding with time(NULL) on every call
//    made two calls within the same second (as done by the demo, which calls
//    generate(c); generate(e); back to back) produce identical noise.
void generate(Numcpp<nc_t> &nc)
{
    static bool seeded = false;
    if (!seeded)
    {
        srand(time(NULL));
        seeded = true;
    }
    for (size_t i = 0; i < nc.row; i++)
    {
        for (size_t j = 0; j < nc.col; j++)
        {
            double U1 = (rand() + 1.0) / (RAND_MAX + 1.0);       // uniform on (0, 1]
            double U2 = rand() * 1.0 / RAND_MAX;                 // uniform on [0, 1]
            double Z = sqrt(-2 * log(U1)) * cos(2 * NP_PI * U2); // standard normal
            // Shift/scale: mean 1, standard deviation 3.
            nc[i][j] *= 1 + 3 * Z;
        }
    }
}
nc_t func(nc_t n, nc_t m)
{
    // Element-wise product callback for the c<func> e special multiplication.
    return n * m;
}
nc_t sigmoid(nc_t n, nc_t m)
{
    // Complex-valued logistic function 1 / (1 + e^-n); m is ignored.
    const nc_t one(1, 0);
    return one / (one + exp(-n));
}
int main(int argc, char const *argv[])
{
    // Enable multi-core optimization.
    np::is_optimized = true;
    /* Numcpp<type> name(row, col) creates a row * col matrix. */
    Numcpp<nc_t> n(16, 16), m(16, 16);
    /* Every element defaults to 1; an initial value can also be supplied. */
    Numcpp<nc_t> c(6, 7, 3.0);
    Numcpp<nc_t> e(6, 8);
    /* Scalar broadcast operations. */
    n *= 2.0;
    m *= 3.0;
    try
    {
        // Matrix multiplication:
        Numcpp<nc_t> result = n * m;
        std::cout << "n * m:" << result << "\n";
        // The optimized multiplication algorithm takes effect when
        // M*K*N > 64*64*64; it does not apply to the "special
        // multiplication" (operator<fn>) used below.
        // Element-wise matrix arithmetic:
        result = n + m;
        std::cout << "n + m:" << result << "\n";
        // Hadamard (element-wise) product:
        result = n.Hadamard(m);
        std::cout << "n (h*) m:" << result << "\n";
        // Fill matrices with normally distributed noise.
        generate(c);
        generate(e);
        std::cout << c << "\n";
        std::cout << e << "\n";
        // In-place transpose:
        c.transposed();
        std::cout << "c transposed:" << c << "\n";
        // Special multiplication with a user callback (creates a new matrix):
        Numcpp<nc_t> Out = c<func> e;
        std::cout << "Out:" << Out << "\n";
        // Special multiplication with a unary function (NULL right operand):
        Numcpp<nc_t> act = result<sigmoid> NULL;
        std::cout << "act:" << act << "\n";
        // Matrix FFT (argument 1 = forward transform):
        std::cout << "RAW:" << result << "\n";
        result.ffted(1);
        std::cout << "FFT:" << result << "\n";
        // Inverse FFT (argument -1):
        result.ffted(-1);
        std::cout << "iFFT" << result << "\n";
        // Save the matrix to a file.
        Out.save("mat");
        // Load it back.
        Numcpp<nc_t> temp = load<nc_t>("mat");
        std::cout << "temp load in Out:" << temp << "\n";
        // Stream-style initialization of a square matrix.
        Numcpp<int> mat(3, 3);
        mat << 4, 1, 1,
            1, 3, 2,
            1, 2, 5;
        // Inverse and determinant of a square matrix.
        std::cout << "mat:" << mat << "\n";
        std::cout << "mat's sum:" << mat.sum() << "\n";
        std::cout << "mat Determinant value:" << mat.determinant() << "\n";
        std::cout << "mat Inverse mat:" << mat.inverse() << "\n";
        // Stream initialization combined with direct assignment.
        Numcpp<double> nmat = (Numcpp<double>(4, 3) << 4, 1, 1, 1,
                               3, 2, 1, 2,
                               5, 5, 1, 1);
        // Pseudo-inverse of a non-square matrix.
        std::cout << "nmat:" << nmat << "\n";
        std::cout << "nmat pseudoinverse mat:" << nmat.pseudoinverse() << "\n";
        std::cout << "nmat[0:]:" << nmat.srow(0) << std::endl;
        std::cout << "nmat[:2]:" << nmat.scol(2) << std::endl;
        Numcpp<double> U, S, V;
        nmat.svd(U, S, V);
        std::cout << "SVD_U:" << U << "\n";
        std::cout << "SVD_S:" << S << "\n";
        std::cout << "SVD_V:" << V << "\n";
        std::cout << "rebuild nmat:" << U * S * V.transpose() << "\n";
        // Lambdas are accepted by the special multiplication as well:
        std::cout << "<lambda>:" << (temp<[](nc_t x, nc_t y) -> nc_t
                                     { return nc_t(sin(x.real()), sin(x.imag())); }>
                                     NULL)
                  << std::endl;
        std::cout << "<func>:" << (n<sinxy> NULL)
                  << std::endl;
        // Gaussian matrix generation, basic usage:
        auto gmat = np::randn<double>(100, 100); // 100x100 standard Gaussian matrix
        // Custom distribution parameters:
        np::GaussianConfig config;
        config.mean = 5.0;
        config.stddev = 2.0;
        config.seed = 12345;
        auto custom_mat = np::randn<double>(50, 50, config);
        // Multi-threaded generation for large matrices:
        auto big_mat = np::randn_parallel<double>(1000, 1000, config, 8);
        // Multivariate Gaussian from a covariance matrix:
        np::Numcpp<double> cov(2, 2);
        cov << 1.0, 0.8, 0.8, 1.0;
        auto multi_mat = np::multivariate_randn<double>(1000, cov);
        // Vector dot product:
        Numcpp<int> v1(1, 9), v2(9, 1, 8);
        std::cout << "Dot: " << v1.dot(v1) << std::endl;
        // Matrix norms:
        auto normat = np::randn<double>(16, 16);
        std::cout << "normat: " << normat << std::endl;
        std::cout << "normat's L1: " << normat.norm(np::L1) << std::endl;
        std::cout << "normat's L2: " << normat.norm(np::L2) << std::endl;
        std::cout << "normat's INF: " << normat.norm(np::INF) << std::endl;
    }
    catch (const std::exception &e)
    {
        std::cerr << e.what() << '\n';
    }
    return 0;
}
Numcpp<typename> nc(4, 4),matrix(4,4);//矩阵简单运算的赋值运算
nc += matrix;
nc -= matrix;
nc = matrix;
//矩阵数乘广播的赋值运算
nc *= number; //number为一个数值
nc += number;
nc -= number;
nc /= number;
nc + number -> Numcpp
nc - number -> Numcpp
nc * number -> Numcpp
nc / number -> Numcpp
Numcpp<typename> result = nc + matrix;
Numcpp<typename> result = nc - matrix;
Numcpp<typename> result = nc * number;//数乘
Numcpp<typename> result = nc * matrix;//矩阵乘法
typename data = nc[x][y];//下标访问
nc.srow(index) -> Numcpp //行提取
nc.scol(index) -> Numcpp //列提取
//哈达马乘积的基本运算
Numcpp<typename> result = nc.Hadamard(matrix);
//哈达马乘积的赋值运算
nc.Hadamard_self(matrix);
//获取矩阵的转置
Numcpp<typename> result = nc.transpose();
//转置矩阵
nc.transposed();
// 流式创建一个方阵
Numcpp<nc_t> mat(3, 3);
mat << 1, 2, 3, 4, 5, 6, 7, 8, 9;
// 方阵的逆
mat.inverse() -> Numcpp
// 矩阵的逆
Numcpp<nc_t> nmat(4, 3);
nmat << 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12;
nmat.pseudoinverse() -> Numcpp
mat.determinant() -> Number
Numcpp<nc_t> Out = c<func> e; // 会创建一个新的矩阵
Numcpp<nc_t> act = result<sigmoid> NULL;
// 矩阵fft
result.ffted(1);
// ifft
result.ffted(-1);
//默认同时开启矩阵乘法优化算法和CPU多核优化加速计算
nc.optimized(true);
//设置最大并发数
nc.maxprocs_set(thread_num);
值得一提的是:
if (sqrt(thread_num) * sqrt(thread_num) > std::thread::hardware_concurrency() || thread_num < 1)
{
throw std::invalid_argument("Invalid maxprocs");
}
如果最大并发数大于CPU的最大线程数,优化将毫无意义,我们禁止这种行为。同时根据约定,当thread_num的值为一个可开方数,且能够和矩阵的形状成几何倍数关系时,优化效果最佳。此外,如果nc.row/sqrt(thread_num)或nc.col/sqrt(thread_num)不为整数,则可能存在分块缺陷。看起来像这样:
(16,8)[
[0][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[1][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[2][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[3][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[4][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[5][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[6][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[7][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[8][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[9][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[10][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[11][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[12][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[13][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[14][(2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (2,0) , (1,0) , (1,0)]
[15][(1,0) , (1,0) , (1,0) , (1,0) , (1,0) , (1,0) , (1,0) , (1,0)]
]
正常被处理的输出值应该为2,这是由于分块算法无法切割残块导致的,所以应该确保矩阵可以被分割为N个正方形大小的矩阵,如此可以保证效率与数据安全。
关于矩阵乘法的优化算法,则是采用了Coppersmith和Winograd在1990年由Strassen算法改进而来的Coppersmith-Winograd算法:论文地址:107.pdf
/*
The mathematical principles mentioned in the paper:
* this_matrix A_row * A_col
* other_matrix A_col * B_col
* result A_row * B_col
* result = this_matrix * other_matrix
* S1 = A21 + A22 T1 = B12 - B11
* S2 = S1 - A11 T2 = B22 - T1
* S3 = A11 - A21 T3 = B22 - B12
* S4 = A12 - S2 T4 = T2 - B21
* M1 = A11 * B11 U1 = M1 + M2
* M2 = A12 * B21 U2 = M1 + M6
* M3 = S4 * B22 U3 = U2 + M7
* M4 = A22 * T4 U4 = U2 + M5
* M5 = S1 * T1 U5 = U4 + M3
* M6 = S2 * T2 U6 = U3 - U4
* M7 = S3 * T3 U7 = U3 + M5
* C11 = U1
* C12 = U5
* C21 = U6
* C22 = U7
*/
// 创建矩阵
np::Numcpp<int> mat(4, 5);
mat << 1, 1, 0, 1, 0,
1, 1, 1, 1, 1,
1, 1, 1, 1, 1,
1, 0, 1, 1, 0;
// 查找最大矩形
np::Rectangle rect = np::findMaximalRectangle(mat);
std::cout << "最大矩形: " << rect.area << " 面积" << std::endl;
// 基本用法
auto mat = np::randn<double>(100, 100); // 100x100标准高斯矩阵
// 自定义参数
np::GaussianConfig config;
config.mean = 5.0;
config.stddev = 2.0;
config.seed = 12345;
auto custom_mat = np::randn<double>(50, 50, config);
// 多线程生成大矩阵
auto big_mat = np::randn_parallel<double>(1000, 1000, config, 8);
// 多变量高斯
np::Numcpp<double> cov(2, 2);
cov << 1.0, 0.8, 0.8, 1.0;
auto multi_mat = np::multivariate_randn<double>(1000, cov);
提供所有基础操作的CUDA加速,同时基于性能考量,内存自动同步默认为关,在完成所有运算后应手动同步数据回主机。
#include <iostream>
#include "Numcpp.hpp"
using namespace np;
#define nc_t double
// CUDA demo: every basic operation is GPU-accelerated. For performance,
// automatic host<->device synchronization is OFF by default; sync manually
// with .to(DEVICE_LOCAL) after computing.
// NOTE: this translation unit must be compiled with nvcc.
int main()
{
    np::is_optimized = true;
    Numcpp<nc_t> n(16, 16);
    Numcpp<nc_t> m(16, 16);
    std::cout << n;
    std::cout << m;
    // Upload both matrices to the GPU.
    n.to(DEVICE_CUDA);
    m.to(DEVICE_CUDA);
    // Operate on the GPU.
    n *= 2.0; // scalar broadcast
    m *= 3.0;
    n += 4;
    m -= 6;
    n /= 8;
    m /= 12;
    // Sync back to the CPU to inspect the results.
    n.to(DEVICE_LOCAL);
    m.to(DEVICE_LOCAL);
    std::cout << n;
    std::cout << m;
    // GPU-accelerated matrix multiplication (no optimization algorithm).
    Numcpp<nc_t> result = n * m;
    result.to(DEVICE_LOCAL);
    std::cout << result;
    // In-place subtraction.
    n -= m;
    n.to(DEVICE_LOCAL);
    std::cout << n;
    // Element-wise broadcast chain; enable auto-sync (off by default to avoid
    // repeated host/device copies while computing).
    n.auto_sync = true;
    std::cout << (n - 10) / 8.0 * 5 + 3 - 2 * 4 / 2 + 1 << std::endl;
    return 0;
}
利用偏异化随机生成产生符合特定要求的训练数据,然后进行神经网络模型的训练。
使用了两层隐含层。
#include "Numcpp/Numcpp.hpp"
#include <math.h>
#include <vector>
#define nc_t double
#define OWN(x, a, b) (a / b) * sqrt(a *x) * sqrt(a *x)
nc_t OWN_A = 1, OWN_K = 1;
// Fill each row of nc with one random sample: [age, weight, height, gender].
// NOTE(review): writes columns 0..3 unconditionally, so nc must have at
// least 4 columns — verify at every call site.
void gIN(Numcpp<nc_t> &nc)
{
    for (size_t j = 0; j < nc.row; j++)
    {
        nc.matrix[j][0] = 20 + 50 * rand() * 1.0f / RAND_MAX; // age, uniform in [20, 70]
        nc.matrix[j][1] = 50 + 10 * rand() * 1.0f / RAND_MAX; // weight, uniform in [50, 60]
        nc.matrix[j][2] = 170 + 30 * rand() * 1.0f / RAND_MAX; // height, uniform in [170, 200]
        double fam = rand() * 1.0f / RAND_MAX; // coin flip for the gender label
        if (fam > 0.5)
        {
            // Male: label 1, slightly heavier and taller.
            nc.matrix[j][3] = 1;
            nc.matrix[j][1] *= 1.05;
            nc.matrix[j][2] *= 1.05;
        }
        else
        {
            // Female: label 0, slightly lighter and shorter.
            nc.matrix[j][3] = 0;
            nc.matrix[j][1] *= 0.95;
            nc.matrix[j][2] *= 0.95;
        }
    }
}
// Center the first three columns (age, weight, height) of nc around their
// column means.
// Fix: the original `auto asum = 0, ...` deduced int, truncating every
// double sample during accumulation and then doing integer division by
// nc.row — the accumulators must be nc_t.
void average(Numcpp<nc_t> &nc)
{
    nc_t asum = 0, wsum = 0, hsum = 0;
    for (size_t i = 0; i < nc.row; i++)
    {
        asum += nc.matrix[i][0];
        wsum += nc.matrix[i][1];
        hsum += nc.matrix[i][2];
    }
    asum /= nc.row;
    wsum /= nc.row;
    hsum /= nc.row;
    for (size_t i = 0; i < nc.row; i++)
    {
        nc.matrix[i][0] -= asum;
        nc.matrix[i][1] -= wsum;
        nc.matrix[i][2] -= hsum;
    }
}
// Write the constant pattern [age, weight, high, 0] into every row of nc.
void gWei(Numcpp<nc_t> &nc, nc_t age, nc_t weight, nc_t high)
{
    for (size_t r = 0; r < nc.row; r++)
    {
        nc.matrix[r][0] = age;
        nc.matrix[r][1] = weight;
        nc.matrix[r][2] = high;
        nc.matrix[r][3] = 0;
    }
}
// Copy the gender label (column 3 of Inputs) into the single-column
// target matrix Cost.
void gCost(Numcpp<nc_t> &Inputs, Numcpp<nc_t> &Cost)
{
    const size_t rows = Inputs.row;
    for (size_t r = 0; r < rows; ++r)
        Cost.matrix[r][0] = Inputs.matrix[r][3];
}
nc_t sigmoid(nc_t x, nc_t y)
{
    // Logistic function 1 / (1 + e^-x); y is unused (the <fn> operator
    // always passes two arguments).
    const nc_t z = exp(-x);
    return 1 / (1 + z);
}
// Derivative of the logistic function: s(x) * (1 - s(x)).
// Fix: evaluate sigmoid (and its exp) once instead of twice.
nc_t d_sigmoid(nc_t x, nc_t y)
{
    const nc_t s = sigmoid(x, y);
    return s * (1 - s);
}
nc_t Squdiff(nc_t x, nc_t y)
{
    // Squared-error term for the loss; the second argument is unused.
    const nc_t diff = x;
    return diff * diff;
}
nc_t eta = 0.01;
int main(int argc, char const *argv[])
{
// generate random data with random values
Numcpp<nc_t> Inputs(64, 4);
gIN(Inputs);
// std::cout << "RAW: " << Inputs << "\n";
Numcpp<nc_t> Cost(Inputs.row, 1);
gCost(Inputs, Cost);
// std::cout << "COST: " << Cost << "\n";
// set weight
Numcpp<nc_t> Wei(Inputs.col, Inputs.col);
gWei(Wei, 1, 1, 1);
Numcpp<nc_t> Aver = Inputs * Wei;
average(Aver);
Aver.col -= 1;
// std::cout << "Averaged:" << Aver << "\n";
// Weight & BaisOut_Wei
Numcpp<nc_t> Hider_Wei(Aver.col, 2); // 3*2
Numcpp<nc_t> Hider_Bais(Aver.row, Hider_Wei.col, 0); // 16*2
Numcpp<nc_t> Out_Wei(Hider_Wei.col, 1); // 2*1
Numcpp<nc_t> Out_Bais(Aver.row, Out_Wei.col, 0); // 16*1
//训练循环次数
for (size_t i = 0; i < 100000; i++)
// while (1)
{
// std::cout << "############################################################################\n";
// Hide layer computation
Numcpp<nc_t> z1 = Aver * Hider_Wei + Hider_Bais;
Numcpp<nc_t> Hider = z1<sigmoid> NULL;
// std::cout << "Hider: " << Hider << "\n";
// Out layer computation
Numcpp<nc_t> z2 = Hider * Out_Wei + Out_Bais; // 16 * 2
Numcpp<nc_t> Out = z2<sigmoid> NULL;
// std::cout << "Out" << Out << "\n";
// loss computation
Numcpp<nc_t> L = Out - Cost;
Numcpp<nc_t> s_L = Numcpp<nc_t>(1, Inputs.row) * (L<Squdiff> NULL) / Inputs.row;
std::cout << "Loss: " << s_L[0][0] << "\n";
// updata wei & bais computation
// Out
/*
std::cout << "Out_Wei: " << Out_Wei << "\n";
std::cout << "Out_Bais: " << Out_Bais << "\n";
*/
Numcpp<nc_t> dow = Hider.transpose() * (L.Hadamard(z2<d_sigmoid> NULL)) * 2;
Out_Wei = Out_Wei - dow * eta;
Numcpp<nc_t> dob = (L.Hadamard(z2<d_sigmoid> NULL)) * 2;
Out_Bais = Out_Bais - dob * eta;
/*
std::cout << "dow: " << dow << "\n";
std::cout << "dob: " << dob << "\n";
std::cout << "updata Out_Wei: " << Out_Wei << "\n";
std::cout << "updata Out_Bais: " << Out_Bais << "\n";
*/
// Hider
/*
std::cout << "Hider_Wei: " << Hider_Wei << "\n";
std::cout << "Hider_Bais: " << Hider_Bais << "\n";
*/
Numcpp<nc_t> dhw = (Aver.transpose() * (z1<d_sigmoid> NULL).Hadamard(L.Hadamard(z2<d_sigmoid> NULL) * Out_Wei.transpose())) * 2;
Hider_Wei = Hider_Wei - dhw * eta;
Numcpp<nc_t> dhb = (z1<d_sigmoid> NULL).Hadamard(L.Hadamard(z2<d_sigmoid> NULL) * Out_Wei.transpose()) * 2;
Hider_Bais = Hider_Bais - dhb * eta;
/*
std::cout << "dhw: " << dhw << "\n";
std::cout << "dhb: " << dhb << "\n";
std::cout << "updata Hider_Wei: " << Hider_Wei << "\n";
std::cout << "updata Hider_Bais: " << Hider_Bais << "\n";
std::cout << "############################################################################\n";
*/
//_sleep(500);
}
std::cout << "Train done.\n";
// testing
/*
// Age weight high
Numcpp<nc_t> T(Aver.row, 3, 0);
while (1)
{
printf("scan: Age && Weight && High\n");
std::cin >> T.matrix[0][0];
std::cin >> T.matrix[0][1];
std::cin >> T.matrix[0][2];
std::cout << "scan: " << (T.matrix[0][0]) << "&&" << (T.matrix[0][1]) << "&&" << (T.matrix[0][2]) << "\n";
Numcpp<nc_t> T_Hider = (T * Hider_Wei + Hider_Bais)<sigmoid> NULL;
Numcpp<nc_t> T_Out = (T_Hider * Out_Wei + Out_Bais)<sigmoid> NULL;
T_Out.row = 1;
std::cout << "Out" << T_Out << "\n";
}
*/
Numcpp<nc_t> T(Aver.row, 3);
gIN(T);
Numcpp<nc_t> T_Hider = (T * Hider_Wei + Hider_Bais)<sigmoid> NULL;
Numcpp<nc_t> T_Out = (T_Hider * Out_Wei + Out_Bais)<sigmoid> NULL;
std::cout << "Out" << T_Out << "\n";
return 0;
}作者:[yauntyour](yauntyour (yauntyour) · GitHub)
授权协议:MIT开源协议