如何用多线程优化 RGB 转 Lab 颜色空间转换代码的效率

C++ 代码（RGB 转 Lab）已经实现，现在想用多线程去优化，但是我刚接触多线程，不知道哪些地方可以用多线程优化。
代码如下:

#include"stdafx.h"
#include"cvt_color.h"
// The c_sRGBGammaTab_b lookup table is too long to show here, so it is omitted.
inline float Cvt_color::gamma(float x)
{
	return x>0.04045 ? pow((x + 0.055f) / 1.055f, 2.4f) : x / 12.92;
}
int Cvt_color::LabCbrt_b(int i)
{
	float x = i * (1.f / (255.f * (1 << gamma_shift)));
	return (int)(1 << lab_shift2) * (x < 0.008856f ? x * 7.787f + 0.13793103448275862f : pow(x, 1.0 / 3.0));
}
float Cvt_color::splineInterpolate(float x, const float* tab, int n)
{
	int ix = min(max(int(x), 0), n - 1);
	x -= ix;
	tab += ix * 4;
	return ((tab[3] * x + tab[2]) * x + tab[1]) * x + tab[0];
}
void Cvt_color::RGBToLab(int B_1, int G_1, int R_1, uchar &L, uchar &A, uchar &B)
{
	const int Lscale = (116 * 255 + 50) / 100;
	const int Lshift = -((16 * 255 * (1 << lab_shift2) + 50) / 100);
	B_1 = c_sRGBGammaTab_b[B_1];
	G_1 = c_sRGBGammaTab_b[G_1];
	R_1 = c_sRGBGammaTab_b[R_1];
	//B_1 <<= 3;
	//G_1 <<= 3;
	//R_1 <<= 3;
	int fX = LabCbrt_b(func(B_1 * 778 + G_1 * 1541 + R_1 * 1777, lab_shift));
	int fY = LabCbrt_b(func(B_1 * 296 + G_1 * 2929 + R_1 * 871, lab_shift));
	int fZ = LabCbrt_b(func(B_1 * 3575 + G_1 * 448 + R_1 * 73, lab_shift));
	L = func(Lscale * fY + Lshift, lab_shift2);
	A = func(500 * (fX - fY) + 128 * (1 << lab_shift2), lab_shift2);
	B = func(200 * (fY - fZ) + 128 * (1 << lab_shift2), lab_shift2);
}

调用如下:

Cvt_color cvt;
	uchar L_result;
	uchar A_result;
	uchar B_result;
	BYTE *dst;
	dst = NULL;
	int stride4 = (nWidth * 3 + 3) / 4 * 4;
	for (int i = 0; i < nHeight; i++)
	{
		dst = dstData + i*stride4;
		for (int j = 0; j < nWidth; j++)
		{
			int R = dst[2];
			int G = dst[1];
			int B = dst[0];
			cvt.RGBToLab(B, G, R, L_result, A_result, B_result);  //调用封装好的cvtcolor(RGB转Lab)
			dst[0] = L_result;
			dst[1] = A_result;
			dst[2] = B_result;
			dst += 3;
		}
	}

解决方案

另外还有一个难度极高的办法：假如编译器没有正常对这段循环进行自动向量化，你可以手动编写 SIMD 代码。事实上，对于常规的桌面 CPU，向量化带来的性能提升要高于多线程。

Cvt_color cvt;
	uchar L_result;
	uchar A_result;
	uchar B_result;
	BYTE *dst;
	dst = NULL;
	int stride4 = (nWidth * 3 + 3) / 4 * 4; for (int i = 0; i < nHeight; i++)  
//这里可以分段，你不就是遍历全部像素嘛，分成四段，0-nHeight/4,nHeight/4-nHeight/2,nHeight/2-nHeight*3/4,nHeight*3/4-nHeight.然后用四个线程跑就行了。
	{
		dst = dstData + i*stride4;
		for (int j = 0; j < nWidth; j++)
		{
			int R = dst[2];
			int G = dst[1];
			int B = dst[0];
			cvt.RGBToLab(B, G, R, L_result, A_result, B_result);  //调用封装好的cvtcolor(RGB转Lab)
			dst[0] = L_result;
			dst[1] = A_result;
			dst[2] = B_result;
			dst += 3;
		}
	}