#include "PS_2_0Assembler.hpp"

#include "Instruction.hpp"
#include "Error.hpp"
#include "Texture.hpp"

#include <float.h>

namespace swShader
{
	using namespace SoftWire;

	// Storage backing the shader register files. reference() maps an Operand's
	// type and index onto an element of these arrays.
	float4 PS_2_0Assembler::r[12];    // Temporary registers (Operand::TEMPORARY_REGISTER)
	float4 PS_2_0Assembler::t[8];     // Input texture coordinate registers
	float4 PS_2_0Assembler::s[16];    // NOTE(review): presumably sampler registers; never returned by reference() in this file
	float4 PS_2_0Assembler::v[2];     // Color registers
//	float4 PS_2_0Assembler::c[32];   // Defined in PixelShader
	float4 PS_2_0Assembler::oC[4];    // Output color registers
	float4 PS_2_0Assembler::oDepth;   // Output depth register

	// Operands naming the eight internal scratch registers; they resolve to tmp[0..7] through reference().
	Operand PS_2_0Assembler::tmp0(Operand::INTERNAL_REGISTER, 0);
	Operand PS_2_0Assembler::tmp1(Operand::INTERNAL_REGISTER, 1);
	Operand PS_2_0Assembler::tmp2(Operand::INTERNAL_REGISTER, 2);
	Operand PS_2_0Assembler::tmp3(Operand::INTERNAL_REGISTER, 3);
	Operand PS_2_0Assembler::tmp4(Operand::INTERNAL_REGISTER, 4);
	Operand PS_2_0Assembler::tmp5(Operand::INTERNAL_REGISTER, 5);
	Operand PS_2_0Assembler::tmp6(Operand::INTERNAL_REGISTER, 6);
	Operand PS_2_0Assembler::tmp7(Operand::INTERNAL_REGISTER, 7);

	// Memory backing the internal scratch registers above
	float4 PS_2_0Assembler::tmp[8];

	// Pixel position within the scanline; loaded from lx and compared against rx by encode()
	int PS_2_0Assembler::x;

	// Running per-pixel values: initialised from w and z in setupInterpolants(), stepped in interpolate()
	float4 PS_2_0Assembler::RHW = {1};
	float4 PS_2_0Assembler::Z = {0};

	// NOTE(review): not referenced in the visible part of this file
	float4 PS_2_0Assembler::W = {1};

	// Creates an assembler holding a single empty instruction and no generated code yet.
	PS_2_0Assembler::PS_2_0Assembler()
	{
		perspectiveCorrected = false;

		// No machine code until encode() has run
		code = 0;

		// The instruction list starts with one node, which is also the current write position
		instruction = intermediate = new Instruction();
	}

	// Releases the head of the intermediate instruction list allocated by the constructor.
	// NOTE(review): assumes Instruction's destructor releases the rest of the chain - confirm.
	PS_2_0Assembler::~PS_2_0Assembler()
	{
		delete intermediate;
		intermediate = 0;
	}

	// Runs the generated routine, generating it first when it does not exist yet.
	// Throws INTERNAL_ERROR when encoding did not produce any code.
	void PS_2_0Assembler::execute()
	{
		if(code)
		{
			code();
			return;
		}

		// First use: build the routine before calling it
		encode();

		if(!code) throw INTERNAL_ERROR;

		code();
	}

	// Returns a pointer to the generated routine, generating it first when it does not exist yet.
	// Throws INTERNAL_ERROR when encoding did not produce any code.
	void (*PS_2_0Assembler::executable())()
	{
		if(code) return code;

		// First use: build the routine
		encode();

		if(!code) throw INTERNAL_ERROR;

		return code;
	}

	void PS_2_0Assembler::loadConstants()
	{
		if(!intermediate) throw INTERNAL_ERROR;
		Instruction *instruction = intermediate;

		while(instruction)
		{
			Operand &dest = instruction->destination;
			const Operand &fValue1 = instruction->source0;
			const Operand &fValue2 = instruction->source1;
			const Operand &fValue3 = instruction->source2;
			const Operand &fValue4 = instruction->source3;

			switch(instruction->mnemonic)
			{
			case Instruction::DEF:	DEF(dest, fValue1, fValue2, fValue3, fValue4);	break;
			}

			instruction = instruction->next;
		}
	}

	// Sets the mnemonic of the instruction currently being built.
	void PS_2_0Assembler::setMnemonic(Instruction::Mnemonic mnemonic)
	{
		instruction->mnemonic = mnemonic;
	}

	// Sets the modifier of the instruction currently being built.
	void PS_2_0Assembler::setModifier(Instruction::Modifier modifier)
	{
		instruction->modifier = modifier;
	}

	// Copies the operand into the destination slot of the instruction currently being built.
	void PS_2_0Assembler::setDestination(const Operand &operand)
	{
		instruction->destination = operand;
	}

	// Copies the operand into the first source slot of the instruction currently being built.
	void PS_2_0Assembler::setSource0(const Operand &operand)
	{
		instruction->source0 = operand;
	}

	// Copies the operand into the second source slot of the instruction currently being built.
	void PS_2_0Assembler::setSource1(const Operand &operand)
	{
		instruction->source1 = operand;
	}

	// Copies the operand into the third source slot of the instruction currently being built.
	void PS_2_0Assembler::setSource2(const Operand &operand)
	{
		instruction->source2 = operand;
	}

	// Copies the operand into the fourth source slot of the instruction currently being built.
	void PS_2_0Assembler::setSource3(const Operand &operand)
	{
		instruction->source3 = operand;
	}

	// Moves the current write position to whatever Instruction::newNext() returns.
	// NOTE(review): presumably newNext() appends a fresh node to the list - confirm in Instruction.hpp.
	void PS_2_0Assembler::newInstruction()
	{
		instruction = instruction->newNext();
	}

	// Generates the scanline routine and stores its entry point in 'code'.
	//
	// The emitted routine loops x from lx up to (not including) rx. For every pixel it
	// runs the depth test, the shader instructions, the alpha test, blending and the
	// color write, then steps the interpolants.
	//
	// Throws INTERNAL_ERROR when there is no instruction list or on an unknown exception;
	// an Error raised while assembling is rethrown with a prefix.
	void PS_2_0Assembler::encode()
	{
		if(!intermediate) throw INTERNAL_ERROR;

		#ifndef NDEBUG
			setEchoFile("PS_2_0Shader.asm");
		#endif

		try
		{
			pushad();
			freeAll();

			static const float4 zero = {0, 0, 0, 0};

			// Start with all eight SSE registers cleared
			movaps(SoftWire::xmm0, xword_ptr [&zero]);
			movaps(SoftWire::xmm1, xword_ptr [&zero]);
			movaps(SoftWire::xmm2, xword_ptr [&zero]);
			movaps(SoftWire::xmm3, xword_ptr [&zero]);
			movaps(SoftWire::xmm4, xword_ptr [&zero]);
			movaps(SoftWire::xmm5, xword_ptr [&zero]);
			movaps(SoftWire::xmm6, xword_ptr [&zero]);
			movaps(SoftWire::xmm7, xword_ptr [&zero]);

			// x = lx; nothing to draw when x >= rx
			mov(x32(&x), dword_ptr [&lx]);
			cmp(r32(&x), dword_ptr [&rx]);
			jge("return");
			{
				setupInterpolants();

				// Registers are spilled before every label so each jump target sees the same state
				spillAll();
			label("scanlineLoop");
				{
					// shader() walks the list from the start for the single loop body it emits
					instruction = intermediate;

					depthTest();   // Emits jumps to "texkill" when the depth test fails
					{
						shader();

						alphaTest();   // Emits jumps to "alphaFail" when the alpha test fails
						{
							alphaBlend();
							writeOC0();
						}
						spillAll();
					label("alphaFail");
					}
					spillAll();
				label("texkill");

					interpolate();

					// Next pixel; loop while x < rx (signed compare)
					inc(r32(&x));
					cmp(r32(&x), dword_ptr [&rx]);

					spillAll();
					jnge("scanlineLoop");
				}

				// Leave MMX state, used by the blending and color write code
				emms();
			}
		label("return");
			popad();
			ret();
		}
		catch(const Error &error)
		{
			throw Error("Fatal pixel shader assembler error: ") << error;
		}
		catch(...)
		{
			throw INTERNAL_ERROR;
		}

		code = finalize();
	}

	void PS_2_0Assembler::setupInterpolants()
	{
		annotate("setupInterpolants()");

		for(int i = 0; i < 2; i++) vDcl[i] = false;
		for(int i = 0; i < 8; i++) tDcl[i] = false;
		for(int i = 0; i < 16; i++) sDcl[i] = false;

		instruction = intermediate;

		if(instruction->mnemonic == Instruction::PS_2_0)
		{
			PS_2_0();
		}
		else
		{
			throw Error("First shader instruction should be PS_2_0");
		}

		instruction = instruction->next;

		while(instruction)
		{
			Operand &dest = instruction->destination;

			if(instruction->mnemonic == Instruction::DCL)			DCL(dest);
			if(instruction->mnemonic == Instruction::DCL_2D)		DCL_2D(dest);
			if(instruction->mnemonic == Instruction::DCL_CUBE)		DCL_CUBE(dest);
			if(instruction->mnemonic == Instruction::DCL_VOLUME)	DCL_VOLUME(dest);

			instruction = instruction->next;
		};

		static float4 v0;

		if(vDcl[0])
		{
			movaps(x128(&v0), xmmword_ptr [&C]);
			movaps(xmmword_ptr [&v[0]], r128(&v0));
		}

		if(vDcl[1])
		{
			movaps(x128(&v0), xmmword_ptr [&L]);
			movaps(xmmword_ptr [&v[1]], r128(&v0));
		}

		for(i = 0; i < 8; i++)
		{
			if(tDcl[i])
			{
				movaps(x128(&v0), xmmword_ptr [&T[i]]);
				movaps(xmmword_ptr [&t[i]], r128(&v0));
			}
		}

		// Reset to detect usage before declaration
		for(i = 0; i < 2; i++) vDcl[i] = false;
		for(i = 0; i < 8; i++) tDcl[i] = false;
		for(i = 0; i < 16; i++) sDcl[i] = false;

		movss(x128(&v0), dword_ptr [&w]);
		movss(dword_ptr [&RHW], r128(&v0));

		movss(x128(&v0), dword_ptr [&z]);
		movss(dword_ptr [&Z], r128(&v0));

		free(&v0);
	}

	// Emits the code for every instruction from the current list position to the end,
	// dispatching on the mnemonic. Scratch registers are released after each instruction.
	// Throws INTERNAL_ERROR for a mnemonic that is not handled here.
	void PS_2_0Assembler::shader()
	{
		annotate("shader()");

		while(instruction)
		{
			// Several aliases for the same operands, so that the calls below read like
			// the instruction's documented signature
			Operand &dst = instruction->destination;
			Operand &dest = instruction->destination;
			const Operand &src = instruction->source0;
			const Operand &src0 = instruction->source0;
			const Operand &src1 = instruction->source1;
			const Operand &src2 = instruction->source2;
			const Operand &src3 = instruction->source3;
			const Operand &fValue1 = instruction->source0;
			const Operand &fValue2 = instruction->source1;
			const Operand &fValue3 = instruction->source2;
			const Operand &fValue4 = instruction->source3;

			switch(instruction->mnemonic)
			{
			case Instruction::PS_2_0:		PS_2_0();										break;

			// Declarations run again here; setupInterpolants() cleared the flags they set
			case Instruction::DCL:			DCL(dest);										break;
			case Instruction::DCL_2D:		DCL_2D(dest);									break;
			case Instruction::DCL_CUBE:		DCL_CUBE(dest);									break;
			case Instruction::DCL_VOLUME:	DCL_VOLUME(dest);								break;

			case Instruction::DEF:			DEF(dest, fValue1, fValue2, fValue3, fValue4);	break;

			// Arithmetic instructions
			case Instruction::ABS:			ABS(dst, src);									break;
			case Instruction::ADD:			ADD(dst, src0, src1);							break;
			case Instruction::CMP:			CMP(dst, src0, src1, src2);						break;
			case Instruction::CRS:			CRS(dst, src0, src1);							break;
			case Instruction::DP2ADD:		DP2ADD(dst, src0, src1, src2);					break;
			case Instruction::DP3:			DP3(dst, src0, src1);							break;
			case Instruction::DP4:			DP4(dst, src0, src1);							break;
			case Instruction::EXP:			EXP(dst, src);									break;
			case Instruction::FRC:			FRC(dst, src);									break;
			case Instruction::LOG:			LOG(dst, src);									break;
			case Instruction::LRP:			LRP(dst, src0, src1, src2);						break;
			case Instruction::M3X2:			M3X2(dst, src0, src1);							break;
			case Instruction::M3X3:			M3X3(dst, src0, src1);							break;
			case Instruction::M3X4:			M3X4(dst, src0, src1);							break;
			case Instruction::M4X3:			M4X3(dst, src0, src1);							break;
			case Instruction::M4X4:			M4X4(dst, src0, src1);							break;
			case Instruction::MAD:			MAD(dst, src0, src1, src2);						break;
			case Instruction::MAX:			MAX(dst, src0, src1);							break;
			case Instruction::MIN:			MIN(dst, src0, src1);							break;
			case Instruction::MOV:			MOV(dst, src);									break;
			case Instruction::MUL:			MUL(dst, src0, src1);							break;
			case Instruction::NOP:			NOP();											break;
			case Instruction::NRM:			NRM(dst, src);									break;
			case Instruction::POW:			POW(dst, src0, src1);							break;
			case Instruction::RCP:			RCP(dst, src);									break;
			case Instruction::RSQ:			RSQ(dst, src);									break;
			case Instruction::SINCOS:		SINCOS(dst, src0, src1, src2);					break;
			case Instruction::SUB:			SUB(dst, src0, src1);							break;

			// Texture instructions
			case Instruction::TEXKILL:		TEXKILL(src);									break;
			case Instruction::TEXLD:		TEXLD(dst, src0, src1);							break;
			case Instruction::TEXLDB:		TEXLDB(dst, src0, src1);						break;
			case Instruction::TEXLDP:		TEXLDP(dst, src0, src1);						break;

			// INVALID instructions are skipped silently
			case Instruction::INVALID:														break;
			default:						throw INTERNAL_ERROR;
			}

			freeTemps();

			annotate("\n");

			instruction = instruction->next;
		}
	}

	// Emits the depth test for the current pixel: compares Z against depthBuffer[x]
	// and jumps to the "texkill" label when the test selected by depthCompareMode
	// fails. When depthWriteEnable is set, code storing Z into the depth buffer follows.
	// Throws INTERNAL_ERROR for a compare mode that is not listed below.
	void PS_2_0Assembler::depthTest()
	{
		annotate("depthTest()");

		movss(x128(tmp0), dword_ptr [&Z]);
		comiss(r128(tmp0), dword_ptr [r32(&depthBuffer)+4*r32(&x)]);

		// Spill between the compare and the conditional jump, so the jump target sees spilled registers
		spillAll();
		
		// Each case jumps on the inverse of its condition, i.e. when the test fails
		switch(depthCompareMode)
		{
		case DEPTH_ALWAYS:
			break;
		case DEPTH_NEVER:
			jmp("texkill");
			break;
		case DEPTH_LESS:
			jnb("texkill");
			break;
		case DEPTH_GREATEREQUAL:
			jnae("texkill");
			break;
		case DEPTH_LESSEQUAL:
			jnbe("texkill");
			break;
		case DEPTH_GREATER:
			jna("texkill");
			break;
		default:
			throw INTERNAL_ERROR;
		}

		if(depthWriteEnable)
		{
			movss(dword_ptr [r32(&depthBuffer)+4*r32(&x)], r128(tmp0));
		}

		freeTemps();
	}

	// Emits the alpha test: a value taken from oC[0] is scaled by 256, truncated to
	// an integer and compared against alphaReference. When the test selected by
	// alphaCompareMode fails, the emitted code jumps to the "alphaFail" label.
	// Nothing is emitted when alpha testing is disabled.
	// Throws INTERNAL_ERROR for a compare mode that is not listed below.
	void PS_2_0Assembler::alphaTest()
	{
		annotate("alphaTest()");

		if(!alphaTestEnable) return;

		static float4 alpha;

		// Scale factor, matching the one writeOC0() uses for the color write. This
		// static used to be left uninitialised (zero), which made the scaled alpha
		// always zero.
		static const float _256 = 256.0f;

		// NOTE(review): movhlps followed by a 0x00 shuffle selects element 2 of oC[0];
		// confirm this is the lane that holds alpha.
		movhlps(x128(&alpha), r128(&oC[0]));
		shufps(r128(&alpha), r128(&alpha), 0x00);
		mulss(r128(&alpha), dword_ptr [&_256]);
		cvttss2si(x32(&alpha), r128(&alpha));
		cmp(r32(&alpha), dword_ptr [&alphaReference]);

		// Spill between the compare and the conditional jump, so the jump target sees spilled registers
		spillAll();

		// Each case jumps on the inverse of its condition, i.e. when the test fails
		switch(alphaCompareMode)
		{
		case ALPHA_ALWAYS:
			break;
		case ALPHA_NEVER:
			jmp("alphaFail");
			break;
		case ALPHA_LESS:
			jnb("alphaFail");
			break;
		case ALPHA_GREATEREQUAL:
			jnae("alphaFail");
			break;
		case ALPHA_LESSEQUAL:
			jnbe("alphaFail");
			break;
		case ALPHA_GREATER:
			jna("alphaFail");
			break;
		default:
			throw INTERNAL_ERROR;
		}
	}

	// Emits the blending code: oC[0] = oC[0] * sourceFactor + pixel * destFactor,
	// where 'pixel' is the color currently in the color buffer. Multiplications by
	// one and zero are not emitted. Nothing is emitted when blending is disabled.
	//
	// Throws Error for an unsupported color depth and INTERNAL_ERROR for an unknown
	// blend factor.
	void PS_2_0Assembler::alphaBlend()
	{
		annotate("alphaBlend()");

		if(!alphaBlendEnable) return;

		static float4 pixel;

		// The destination color is only read when it contributes to the result.
		// NOTE(review): a source factor of BLEND_DEST* combined with a destination
		// factor of BLEND_ZERO uses 'pixel' without it having been read - confirm.
		if(destBlendFactor != BLEND_ZERO)
		{
			static dword2 unpackl;
			static dword2 unpackh;

			// Read pixel
			switch(colorDepth)
			{
			case COLOR_B8G8R8A8:
				punpcklbw(x64(&unpackl), qword_ptr [r32(&colorBuffer)+4*r32(&x)]);
				punpckhwd(x64(&unpackh), m64(&unpackl));
				punpcklwd(x64(&unpackl), m64(&unpackl));
				cvtpi2ps(x128(&pixel), m64(&unpackh));		free(&unpackh);
				movlhps(x128(&pixel), r128(&pixel));
				cvtpi2ps(r128(&pixel), m64(&unpackl));		free(&unpackl);
				break;
			default:
				throw Error("Target color depth (%d) not supported", colorDepth);
			}
		}

		static float4 sourceFactor;
		static float4 destFactor;

		// NOTE(review): these are 16-bit fixed-point constants, yet 'inv' is XORed
		// into floating-point registers below; an inverted factor would normally be
		// computed as 1 - factor. Left unchanged, needs confirmation.
		const static word4 one = {1 << 12, 1 << 12, 1 << 12, 1 << 12};   // 1.0 in 1.3.12 fixed-point format
		const static word4 inv = {0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF};

		// NOTE(review): the *ALPHA cases broadcast element 0 (shuffle 0x00) - confirm
		// that this is the lane that holds alpha.
		switch(sourceBlendFactor)
		{
		case BLEND_ZERO:
		//	xorps(r128(&sourceFactor), m128(&sourceFactor));   // Optimized
			break;
		case BLEND_ONE:
		//	movaps(x128(&sourceFactor), xword_ptr [&one]);   // Optimized
			break;
		case BLEND_SOURCE:
			movaps(x128(&sourceFactor), m128(&oC[0]));
			break;
		case BLEND_INVSOURCE:
			movaps(x128(&sourceFactor), m128(&oC[0]));
			xorps(r128(&sourceFactor), xword_ptr [&inv]);
			break;
		case BLEND_DEST:
			movaps(x128(&sourceFactor), m128(&pixel));
			break;
		case BLEND_INVDEST:
			movaps(x128(&sourceFactor), m128(&pixel));
			xorps(r128(&sourceFactor), xword_ptr [&inv]);
			break;
		case BLEND_SOUCEALPHA:
			movaps(x128(&sourceFactor), m128(&oC[0]));
			shufps(r128(&sourceFactor), r128(&sourceFactor), 0x00);
			break;
		case BLEND_INVSOURCEALPHA:
			movaps(x128(&sourceFactor), m128(&oC[0]));
			shufps(r128(&sourceFactor), r128(&sourceFactor), 0x00);
			xorps(r128(&sourceFactor), xword_ptr [&inv]);
			break;
		case BLEND_DESTALPHA:
			movaps(x128(&sourceFactor), m128(&pixel));
			shufps(r128(&sourceFactor), r128(&sourceFactor), 0x00);
			break;
		case BLEND_INVDESTALPHA:
			movaps(x128(&sourceFactor), m128(&pixel));
			shufps(r128(&sourceFactor), r128(&sourceFactor), 0x00);
			xorps(r128(&sourceFactor), xword_ptr [&inv]);
			break;
		default:
			throw INTERNAL_ERROR;
		}

		switch(destBlendFactor)
		{
		case BLEND_ZERO:
		//	xorps(r128(&destFactor), m128(&destFactor));   // Optimized
			break;
		case BLEND_ONE:
		//	movaps(x128(&destFactor), xword_ptr [&one]);	// Optimized
			break;
		case BLEND_SOURCE:
			movaps(x128(&destFactor), m128(&oC[0]));
			break;
		case BLEND_INVSOURCE:
			movaps(x128(&destFactor), m128(&oC[0]));
			xorps(r128(&destFactor), xword_ptr [&inv]);
			break;
		case BLEND_DEST:
			movaps(x128(&destFactor), m128(&pixel));
			break;
		case BLEND_INVDEST:
			movaps(x128(&destFactor), m128(&pixel));
			xorps(r128(&destFactor), xword_ptr [&inv]);
			break;
		case BLEND_SOUCEALPHA:
			movaps(x128(&destFactor), m128(&oC[0]));
			// Broadcast within destFactor itself, like every other case; this used
			// to mix in the unrelated sourceFactor register
			shufps(r128(&destFactor), r128(&destFactor), 0x00);
			break;
		case BLEND_INVSOURCEALPHA:
			movaps(x128(&destFactor), m128(&oC[0]));
			shufps(r128(&destFactor), r128(&destFactor), 0x00);
			xorps(r128(&destFactor), xword_ptr [&inv]);
			break;
		case BLEND_DESTALPHA:
			movaps(x128(&destFactor), m128(&pixel));
			shufps(r128(&destFactor), r128(&destFactor), 0x00);
			break;
		case BLEND_INVDESTALPHA:
			movaps(x128(&destFactor), m128(&pixel));
			shufps(r128(&destFactor), r128(&destFactor), 0x00);
			xorps(r128(&destFactor), xword_ptr [&inv]);
			break;
		default:
			throw INTERNAL_ERROR;
		}

		// Source term: multiplying by one is skipped, multiplying by zero becomes a clear
		if(sourceBlendFactor != BLEND_ONE)
		{
			if(sourceBlendFactor != BLEND_ZERO)
			{
				mulps(r128(&oC[0]), m128(&sourceFactor));
			}
			else
			{
				xorps(r128(&oC[0]), m128(&oC[0]));
			}
		}

		// Destination term: skipped entirely for zero, not multiplied for one
		if(destBlendFactor != BLEND_ZERO)
		{
			if(destBlendFactor != BLEND_ONE)
			{
				mulps(r128(&pixel), m128(&destFactor));
			}

			addps(r128(&oC[0]), m128(&pixel));
		}
	}

	// Emits the color write: oC[0] is scaled by 256, converted to integers, packed
	// down to four bytes with saturation and stored at colorBuffer[x].
	void PS_2_0Assembler::writeOC0()
	{
		annotate("writeOC0()");

		static const float4 _256 = {256, 256, 256, 256};
		static qword c0;
		static qword c1;

		movaps(x128(tmp0), m128(&oC[0]));
		mulps(r128(tmp0), xmmword_ptr [&_256]);
		cvtps2pi(r64(&c0), r128(tmp0));       // Low two components
		movhlps(r128(tmp0), r128(tmp0));
		cvtps2pi(r64(&c1), r128(tmp0));       // High two components
		packssdw(r64(&c0), r64(&c1));         // Four 16-bit words
		pshufw(x64(&c0), r64(&c0), 0xC6);   // RGBA -> BGRA
		packuswb(r64(&c0), r64(&c0));         // Four bytes, saturated to 0..255
		movd(dword_ptr [r32(&colorBuffer)+4*r32(&x)], r64(&c0));

		free(&c0);
		free(&c1);
		freeTemps();
	}

	void PS_2_0Assembler::interpolate()
	{
		annotate("interpolate()");

		// Vertex color/light interpolation
		if(vDcl[0]) addps(r128(&v[0]), xmmword_ptr [&dC_dx]);
		if(vDcl[1]) addps(r128(&v[1]), xmmword_ptr [&dL_dx]);

		// Texture coordinate interpolation
		if(tDcl[0]) addps(r128(&t[0]), xmmword_ptr [&dT_dx[0]]);
		if(tDcl[1]) addps(r128(&t[1]), xmmword_ptr [&dT_dx[1]]);
		if(tDcl[2]) addps(r128(&t[2]), xmmword_ptr [&dT_dx[2]]);
		if(tDcl[3]) addps(r128(&t[3]), xmmword_ptr [&dT_dx[3]]);
		if(tDcl[4]) addps(r128(&t[4]), xmmword_ptr [&dT_dx[4]]);
		if(tDcl[5]) addps(r128(&t[5]), xmmword_ptr [&dT_dx[5]]);
		if(tDcl[6]) addps(r128(&t[6]), xmmword_ptr [&dT_dx[6]]);
		if(tDcl[7]) addps(r128(&t[7]), xmmword_ptr [&dT_dx[7]]);

		movss(x128(tmp0), dword_ptr[&RHW]);
		addss(r128(tmp0), dword_ptr[&dw_dx]);
		movss(dword_ptr [&RHW], r128(tmp0));

		movss(x128(tmp0), dword_ptr[&Z]);
		addss(r128(tmp0), dword_ptr[&dz_dx]);
		movss(dword_ptr [&Z], r128(tmp0));

		freeTemps();
		spillAll();
	}

	// Returns the address of the memory backing the given register operand.
	//
	// Throws INTERNAL_ERROR when the operand type has no backing storage here, or when
	// its index lies outside the register file. The index is checked here because
	// callers add row offsets to it (see the r128/x128/m128 overloads taking 'next'),
	// which could otherwise address memory beyond the end of the array.
	void *PS_2_0Assembler::reference(const Operand &reg)
	{
		switch(reg.type)
		{
		case Operand::COLOR_REGISTER:
			if(reg.index < 0 || reg.index >= 2) throw INTERNAL_ERROR;
			return &v[reg.index];
		case Operand::CONSTANT_FLOAT_REGISTER:
			// c is defined outside this file; its extent is not visible here, so only
			// negative indices are rejected
			if(reg.index < 0) throw INTERNAL_ERROR;
			return &c[reg.index];
		case Operand::INPUT_TEXTURE_COORDINATE_REGISTER:
			if(reg.index < 0 || reg.index >= 8) throw INTERNAL_ERROR;
			return &t[reg.index];
	//	case Operand::SAMPLER_REGISTER:
	//		return 0;
		case Operand::TEMPORARY_REGISTER:
			if(reg.index < 0 || reg.index >= 12) throw INTERNAL_ERROR;
			return &r[reg.index];
		case Operand::OUTPUT_COLOR_REGISTER:
			if(reg.index < 0 || reg.index >= 4) throw INTERNAL_ERROR;
			return &oC[reg.index];
		case Operand::OUTPUT_DEPTH_REGISTER:
			return &oDepth;
	//	case Operand::CONSTANT_FLOAT_LITERAL:
	//		return 0;
		case Operand::INTERNAL_REGISTER:
			if(reg.index < 0 || reg.index >= 8) throw INTERNAL_ERROR;
			return &tmp[reg.index];
		default:
			throw INTERNAL_ERROR;
		}
	}

	// Forwards to CodeGenerator::r128 for the memory backing register 'reg', offset by
	// 'next' registers. The declaration check is done on 'reg' itself, before the
	// offset is applied.
	const OperandXMMREG PS_2_0Assembler::r128(const Operand &reg, int next)
	{
		checkDcl(reg);

		Operand op = reg;
		op.index += next;

		return CodeGenerator::r128(reference(op));
	}

	// Forwards to CodeGenerator::x128 for the memory backing register 'reg', offset by
	// 'next' registers. The declaration check is done on 'reg' itself, before the
	// offset is applied.
	const OperandXMMREG PS_2_0Assembler::x128(const Operand &reg, int next)
	{
		checkDcl(reg);

		Operand op = reg;
		op.index += next;

		return CodeGenerator::x128(reference(op));
	}

	// Forwards to CodeGenerator::m128 for the memory backing register 'r_m', offset by
	// 'next' registers. The declaration check is done on 'r_m' itself, before the
	// offset is applied.
	const OperandR_M128 PS_2_0Assembler::m128(const Operand &r_m, int next)
	{
		checkDcl(r_m);

		Operand op = r_m;
		op.index += next;

		return CodeGenerator::m128(reference(op));
	}

	// Returns the m128 operand for the register reinterpreted as an OperandXMM32.
	// NOTE(review): this casts a temporary to a reference type, which relies on the
	// two operand types being layout-compatible and on a lenient compiler - confirm.
	const OperandXMM32 PS_2_0Assembler::xmm32(const Operand &r_m, int next)
	{
		return (OperandXMM32&)m128(r_m, next);
	}

	// Plain forwarder to CodeGenerator::r128 for a raw memory reference.
	const OperandXMMREG PS_2_0Assembler::r128(const OperandREF &ref)
	{
		return CodeGenerator::r128(ref);
	}

	// Plain forwarder to CodeGenerator::x128 for a raw memory reference.
	const OperandXMMREG PS_2_0Assembler::x128(const OperandREF &ref)
	{
		return CodeGenerator::x128(ref);
	}

	// Plain forwarder to CodeGenerator::m128 for a raw memory reference.
	const OperandR_M128 PS_2_0Assembler::m128(const OperandREF &ref)
	{
		return CodeGenerator::m128(ref);
	}

	// Returns CodeGenerator::m128 for a raw memory reference, reinterpreted as an OperandXMM32.
	// NOTE(review): this casts a temporary to a reference type, which relies on the
	// two operand types being layout-compatible and on a lenient compiler - confirm.
	const OperandXMM32 PS_2_0Assembler::xmm32(const OperandREF &ref)
	{
		return (OperandXMM32&)CodeGenerator::m128(ref);
	}

	// Plain forwarder to CodeGenerator::free for a raw memory reference.
	void PS_2_0Assembler::free(const OperandREF &ref)
	{
		CodeGenerator::free(ref);
	}

	void PS_2_0Assembler::checkDcl(const Operand &op)
	{
		if(op.type == Operand::COLOR_REGISTER)
		{
			if(op.index < 0 || op.index >= 2) throw INTERNAL_ERROR;

			if(vDcl[op.index] == true) return;
			else throw Error("Use of undeclared color register v%d", op.index);
		}
		else if(op.type == Operand::INPUT_TEXTURE_COORDINATE_REGISTER)
		{
			if(op.index < 0 || op.index >= 8) throw INTERNAL_ERROR;

			if(tDcl[op.index] == true) return;
			else throw Error("Use of undeclared input texture register t%d", op.index);
		}
		else if(op.type == Operand::SAMPLER_REGISTER)
		{
			if(op.index < 0 || op.index >= 16) throw INTERNAL_ERROR;

			if(sDcl[op.index] == true) return;
			else throw Error("Use of undeclared sampler register s%d", op.index);
		}
	}

	// Forwards to CodeGenerator::free for the memory backing the given register operand.
	// Note that the parameter name hides the static 'tmp' array inside this function.
	void PS_2_0Assembler::free(const Operand &tmp)
	{
		CodeGenerator::free(reference(tmp));
	}

	void PS_2_0Assembler::freeTemps()
	{
		free(tmp0);
		free(tmp1);
		free(tmp2);
		free(tmp3);
		free(tmp4);
		free(tmp5);
		free(tmp6);
		free(tmp7);
	}

	// Handler for the PS_2_0 version instruction; it emits nothing.
	void PS_2_0Assembler::PS_2_0()
	{
		return;
	}

	// Marks a color or texture coordinate register as declared, after verifying that
	// the vertex format (FVF) provides the matching component.
	// Throws INTERNAL_ERROR for an index outside the register file, and Error when
	// the vertex format lacks the component or the register type cannot be declared.
	void PS_2_0Assembler::DCL(Operand &dest)
	{
		annotate("DCL(%s)", dest.string());

		const int slot = dest.index;

		switch(dest.type)
		{
		case Operand::COLOR_REGISTER:
			if(slot < 0 || slot >= 2) throw INTERNAL_ERROR;

			if(slot == 0 && !FVF.hasDiffuse()) throw Error("Pixel shader input not corresponding with vertex format (missing first color component)");
			if(slot == 1 && !FVF.hasSpecular()) throw Error("Pixel shader input not corresponding with vertex format (missing second color component)");

			vDcl[slot] = true;
			break;
		case Operand::INPUT_TEXTURE_COORDINATE_REGISTER:
			if(slot < 0 || slot >= 8) throw INTERNAL_ERROR;

			if(!FVF.hasTexture(slot)) throw Error("Pixel shader input not corresponding with vertex format (missing %d'th texture coordinate)", dest.index + 1);

			tDcl[slot] = true;
			break;
		default:
			throw Error("Only color and texture registers can be DCL'ed");
		}
	}

	// Marks a sampler register as declared for 2D texturing.
	// Throws INTERNAL_ERROR when the operand is not a sampler in the range s0..s15,
	// and Error when no texture is set for that sampler.
	void PS_2_0Assembler::DCL_2D(Operand &dest)
	{
		annotate("DCL_2D(%s)", dest.string());

		const bool validSampler = dest.type == Operand::SAMPLER_REGISTER && dest.index >= 0 && dest.index <= 15;

		if(!validSampler) throw INTERNAL_ERROR;

		if(!sampler[dest.index].texture)
		{
			throw Error("No texture set for declared sampler s%d", dest.index);
		}

		sDcl[dest.index] = true;
	}

	// Cube map sampler declaration; not implemented, always throws Error.
	void PS_2_0Assembler::DCL_CUBE(Operand &dest)
	{
		annotate("DCL_CUBE(%s)", dest.string());

		throw Error("Cube maps not supported");
	}

	// Volume texture sampler declaration; not implemented, always throws Error.
	void PS_2_0Assembler::DCL_VOLUME(Operand &dest)
	{
		annotate("DCL_VOLUME(%s)", dest.string());

		throw Error("Volume textures not supported");
	}

	// Stores the four literal values directly into the memory backing the destination
	// register. This happens when the assembler runs; no machine code is emitted.
	void PS_2_0Assembler::DEF(Operand &dest, FValue1 fValue1, FValue2 fValue2, FValue3 fValue3, FValue4 fValue4)
	{
		annotate("DEF(%s, %s, %s, %s, %s)", dest.string(), fValue1.string(), fValue2.string(), fValue3.string(), fValue4.string());

		// Resolve the register's storage once and fill in its components
		float *const component = (float*)reference(dest);

		component[0] = fValue1.value;
		component[1] = fValue2.value;
		component[2] = fValue3.value;
		component[3] = fValue4.value;
	}

	// Emits dst = |src| for all components, by clearing the sign bit of each.
	void PS_2_0Assembler::ABS(Operand &dst, const Operand &src)
	{
		annotate("ABS(%s, %s)", dst.string(), src.string());

		// All bits set except the sign bit
		static const int4 SIGN_MASK = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};

		NEG_SWIZZLE(tmp0, src);

		andps(r128(tmp0), xmmword_ptr [SIGN_MASK]);

		SAT_MASK(dst, tmp0);
	}

	// Emits dst = src0 + src1, component-wise.
	void PS_2_0Assembler::ADD(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("ADD(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		addps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// Emits a component-wise select: dst = (src0 < 0) ? src2 : src1.
	void PS_2_0Assembler::CMP(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("CMP(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		static const int4 ZERO = {0x00000000, 0x00000000, 0x00000000, 0x00000000};

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);
		NEG_SWIZZLE(tmp2, src2);
		
		cmpltps(r128(tmp0), xmmword_ptr [ZERO]);   // tmp0 = all-ones mask where src0 < 0
		andps(r128(tmp2), m128(tmp0));             // Keep src2 where the mask is set
		andnps(r128(tmp0), m128(tmp1));            // Keep src1 where the mask is clear
		orps(r128(tmp2), m128(tmp0));              // Merge both selections

		SAT_MASK(dst, tmp2);
	}

	// Emits the cross product dst.xyz = src0 x src1.
	void PS_2_0Assembler::CRS(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("CRS(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// dest.x = src0.y * src1.z - src0.z * src1.y;
		// dest.y = src0.z * src1.x - src0.x * src1.z;
		// dest.z = src0.x * src1.y - src0.y * src1.x;

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		// Subtrahend: src0.zxy * src1.yzx
		movaps(x128(tmp3), m128(tmp0));
		movaps(x128(tmp2), m128(tmp1));
		shufps(r128(tmp3), m128(tmp0), 0xD2);   // src0.zxyw
		shufps(r128(tmp2), m128(tmp1), 0xC9);   // src1.yzxw
		mulps(r128(tmp3), m128(tmp2));

		// Minuend: src0.yzx * src1.zxy
		movaps(x128(tmp2), m128(tmp1));
		shufps(r128(tmp2), m128(tmp1), 0xD2);   // src1.zxyw
		movaps(x128(tmp1), m128(tmp0));
		// 0xC9 selects yzxw. The previous immediate, 0xD9, selected yzyw, which made
		// dest.z use src0.y where it needs src0.x.
		shufps(r128(tmp1), m128(tmp0), 0xC9);   // src0.yzxw
		mulps(r128(tmp1), m128(tmp2));

		subps(r128(tmp1), m128(tmp3));

		SAT_MASK(dst, tmp1);
	}

	// Emits a two-component dot product plus a scalar:
	// dst = src0.x * src1.x + src0.y * src1.y + src2 (first component after swizzling).
	void PS_2_0Assembler::DP2ADD(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("DP2ADD(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		// dest = src0.r * src1.r + src0.g * src1.g + src2.selected_component

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);
		NEG_SWIZZLE(tmp2, src2);   // The addend; it used to be ignored altogether
		
		mulps(r128(tmp0), m128(tmp1));           // Component-wise products
		movaps(x128(tmp1), m128(tmp0));          // Copy them (movlps has no register-to-register form)
		shufps(r128(tmp1), m128(tmp1), 0x01);    // Bring the second product into element 0
		addss(r128(tmp0), xmm32(tmp1));          // Element 0 = first + second product
		addss(r128(tmp0), xmm32(tmp2));          // ... + src2
		// Broadcast element 0, which holds the result. The previous immediate, 0x55,
		// broadcast element 1 and thereby discarded the sum.
		shufps(r128(tmp0), m128(tmp0), 0x00);

		SAT_MOV_X(dst, tmp0);
	}

	// Emits a three-component dot product of src0 and src1; the sum ends up in the
	// first element of tmp0, which is handed to SAT_MOV_X.
	void PS_2_0Assembler::DP3(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("DP3(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulps(r128(tmp0), m128(tmp1));           // Component-wise products
		movhlps(r128(tmp1), r128(tmp0));         // Third product into element 0 of tmp1
		addss(r128(tmp1), xmm32(tmp0));          // ... + first product
		shufps(r128(tmp0), m128(tmp0), 0x01);    // Second product into element 0 of tmp0
		addss(r128(tmp0), xmm32(tmp1));          // ... + (first + third)

		SAT_MOV_X(dst, tmp0);
	}

	// Emits a four-component dot product of src0 and src1; the sum ends up in the
	// first element of tmp1, which is handed to SAT_MOV_X.
	void PS_2_0Assembler::DP4(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("DP4(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulps(r128(tmp0), m128(tmp1));           // Component-wise products
		movhlps(r128(tmp1), r128(tmp0));         // Upper two products into the lower half of tmp1
		addps(r128(tmp0), m128(tmp1));           // Elements 0 and 1 now hold pairwise sums
		movss(x128(tmp1), xmm32(tmp0));          // First pairwise sum
		shufps(r128(tmp0), m128(tmp0), 0x01);    // Second pairwise sum into element 0
		addss(r128(tmp1), xmm32(tmp0));          // Total

		SAT_MOV_X(dst, tmp1);
	}

	// Emits a scalar approximation of two raised to the power of src's first
	// component, replicated to all components. The input is split into a rounded
	// integer part, which is placed directly into a float's exponent field, and a
	// remainder f, which is approximated by the rational (A - f) / (A + f).
	void PS_2_0Assembler::EXP(Operand &dst, const Operand &src)
	{
		annotate("EXP(%s, %s)", dst.string(), src.string());

		static const float A = -2.91421356e+0f;

		// Safe limits
		static const float M = -100;
		static const float N = 100;

		static float TEMP;
		static int r;   // Note: hides the static register array 'r' inside this function

		NEG_SWIZZLE(tmp0, src);

		// Clamp the input to [M, N]
		minss(r128(tmp0), dword_ptr [&N]);
		maxss(r128(tmp0), dword_ptr [&M]);
		// r = input converted to integer, tmp0 = remainder
		movss(x128(tmp1), xmm32(tmp0));
		cvtss2si(r32(&r), xmm32(tmp1));
		cvtsi2ss(r128(tmp1), r32(&r));
		subss(r128(tmp0), xmm32(tmp1));
		// tmp0 = (A - f) / (A + f), using the reciprocal approximation
		movss(x128(tmp1), dword_ptr [&A]);
		subss(r128(tmp1), xmm32(tmp0));
		addss(r128(tmp0), dword_ptr [&A]);
		rcpss(r128(tmp0), xmm32(tmp0));
		mulss(r128(tmp0), xmm32(tmp1));
		// Build a float whose biased exponent field is r + 127 and multiply by it
		add(r32(&r), 127);
		shl(r32(&r), 23);
		mov(dword_ptr [&TEMP], r32(&r));		free(&r);
		mulss(r128(tmp0), dword_ptr [&TEMP]);
		// Replicate the scalar result
		shufps(r128(tmp0), r128(tmp0), 0x00);

		SAT_MASK(dst, tmp0);
	}

	// Emits a component-wise fractional part. The sign is cleared first, then the
	// integer part (obtained by converting value - 0.5 to integer and back) is
	// subtracted from the absolute value.
	// NOTE(review): because of the sign clear, a negative input yields the fraction
	// of its absolute value rather than x - floor(x) - confirm that this is intended.
	void PS_2_0Assembler::FRC(Operand &dst, const Operand &src)
	{
		annotate("FRC(%s, %s)", dst.string(), src.string());

		static const float4 HALF = {0.5f, 0.5f, 0.5f, 0.5f};
		static const int4 MASK = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};

		NEG_SWIZZLE(tmp0, src);

		// MMX scratch registers for the float <-> integer conversions, two components each
		static qword c0;
		static qword c1;

		andps(r128(tmp0), xmmword_ptr [MASK]);      // |x|
		subps(r128(tmp0), xmmword_ptr [HALF]);      // |x| - 0.5
		cvtps2pi(x64(&c0), r128(tmp0));             // Integer parts of the lower two components
		movhlps(r128(tmp1), r128(tmp0));
		cvtps2pi(x64(&c1), r128(tmp1));             // Integer parts of the upper two components
		cvtpi2ps(r128(tmp1), r64(&c1));			free(&c1);
		movlhps(r128(tmp1), r128(tmp1));            // Move them to the upper half
		cvtpi2ps(r128(tmp1), r64(&c0));			free(&c0);
		addps(r128(tmp0), xmmword_ptr [HALF]);      // Back to |x|
		subps(r128(tmp0), m128(tmp1));              // |x| - integer part

		SAT_MASK(dst, tmp0);
	}

	// Emits a scalar approximation of the base-2 logarithm of src's first component,
	// replicated to all components. The float is taken apart by integer operations:
	// the mantissa m (forced into [1, 2)) is approximated by the rational
	// A * (m - 1) / (m + B), and the unbiased exponent is added to that.
	void PS_2_0Assembler::LOG(Operand &dst, const Operand &src)
	{
		annotate("LOG(%s, %s)", dst.string(), src.string());

		static const float A =  3.42234550e0f;
		static const float B =  1.42234550e0f;

		static float TEMP1;
		static float TEMP2;
		static int r;   // Note: hides the static register array 'r' inside this function

		NEG_SWIZZLE(tmp0, src);

		// Move the float's bits to an integer register through memory
		movss(dword_ptr [&TEMP1], r128(tmp0));
		mov(x32(&r), dword_ptr [&TEMP1]);
		// Keep the mantissa bits and set the exponent field to that of 1.0
		and(r32(&r), 0x007FFFFF);
		or(r32(&r), 0x3F800000);
		mov(dword_ptr [&TEMP2], r32(&r));
		movss(x128(tmp0), dword_ptr [&TEMP2]);
		// tmp0 = (m * A - A) / (m + B), using the reciprocal approximation
		movss(x128(tmp1), xmm32(tmp0));
		mulss(r128(tmp0), dword_ptr [&A]);
		subss(r128(tmp0), dword_ptr [&A]);
		addss(r128(tmp1), dword_ptr [&B]);
		rcpss(r128(tmp1), xmm32(tmp1));
		mulss(r128(tmp0), xmm32(tmp1));
		// Extract the exponent field and remove its bias
		mov(x32(&r), dword_ptr [&TEMP1]);
		and(r32(&r), 0x7F800000);
		shr(r32(&r), 23);
		sub(r32(&r), 127);
		cvtsi2ss(r128(tmp1), r32(&r));			free(&r);
		addss(r128(tmp0), xmm32(tmp1));
		// Replicate the scalar result
		shufps(r128(tmp0), r128(tmp0), 0x00);

		SAT_MASK(dst, tmp0);
	}

	// Emits a component-wise linear interpolation between src2 and src1, weighted by src0.
	void PS_2_0Assembler::LRP(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("LRP(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		// dest = src2 + src0 * (src1 - src2)

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);
		NEG_SWIZZLE(tmp2, src2);

		subps(r128(tmp1), m128(tmp2));
		mulps(r128(tmp1), m128(tmp0));
		addps(r128(tmp1), m128(tmp2));

		SAT_MASK(dst, tmp1);
	}

	// Emits a 3-component vector times matrix product using two matrix rows, held in
	// the registers src1 and src1 + 1. The rows are rearranged with unpack
	// instructions so that each of tmp5, tmp2 and tmp4 holds one component (x, y, z)
	// of both rows in its first two elements; each is then scaled by the matching
	// component of src0 and the three are summed. The result is passed to SAT_MOV_XY.
	void PS_2_0Assembler::M3X2(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M3X2(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// Two copies of each row: one for the upper and one for the lower unpack
		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// tmp4 = z components of the rows
		unpckhps(r128(tmp0), m128(tmp0));
		unpckhps(r128(tmp1), m128(tmp1));
		movaps(x128(tmp4), m128(tmp0));
		unpcklps(r128(tmp4), m128(tmp1));

		// tmp2 = y components, tmp5 = x components of the rows
		unpcklps(r128(tmp2), m128(tmp2));
		unpcklps(r128(tmp3), m128(tmp3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Broadcast src0.x, src0.y and src0.z
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XY(dst, tmp1);
	}

	// Emits a 3-component vector times matrix product using three matrix rows, held
	// in the registers src1 to src1 + 2. The rows are rearranged with unpack
	// instructions so that each of tmp5, tmp2 and tmp4 holds one component (x, y, z)
	// of the rows; each is then scaled by the matching component of src0 and the
	// three are summed. The result is passed to SAT_MOV_XYZ.
	void PS_2_0Assembler::M3X3(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M3X3(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// Two copies of the first two rows: one for the upper and one for the lower unpack
		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// tmp4 = z components of the rows
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(tmp1));
		movaps(x128(tmp4), m128(tmp0));
		unpcklps(r128(tmp4), m128(tmp1));

		// tmp2 = y components, tmp5 = x components of the rows
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(tmp3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Broadcast src0.x, src0.y and src0.z
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZ(dst, tmp1);
	}

	// Emits a 3-component vector times matrix product using four matrix rows, held in
	// the registers src1 to src1 + 3. The rows are rearranged with unpack
	// instructions so that each of tmp5, tmp2 and tmp4 holds one component (x, y, z)
	// of all four rows; each is then scaled by the matching component of src0 and the
	// three are summed. The result is passed to SAT_MOV_XYZW.
	void PS_2_0Assembler::M3X4(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M3X4(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// Two copies of the first two rows: one for the upper and one for the lower unpack
		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// tmp4 = z components of the rows
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(src1, 3));
		movaps(x128(tmp4), m128(tmp0));
		unpcklps(r128(tmp4), m128(tmp1));

		// tmp2 = y components, tmp5 = x components of the rows
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(src1, 3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Broadcast src0.x, src0.y and src0.z
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZW(dst, tmp1);
	}

	// Emits a 4-component vector times matrix product using three matrix rows, held
	// in the registers src1 to src1 + 2. The rows are rearranged with unpack
	// instructions so that tmp5, tmp2, tmp4 and tmp0 hold the x, y, z and w
	// components of the rows; each is then scaled by the matching component of src0
	// and the four are summed. The result is passed to SAT_MOV_XYZ.
	void PS_2_0Assembler::M4X3(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M4X3(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// Two copies of the first two rows: one for the upper and one for the lower unpack
		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// tmp0 = w components, tmp4 = z components of the rows
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(tmp1));
		movaps(x128(tmp4), m128(tmp0)); 
		unpckhps(r128(tmp0), m128(tmp1));
		unpcklps(r128(tmp4), m128(tmp1));

		// tmp2 = y components, tmp5 = x components of the rows
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(tmp3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Broadcast src0.x, src0.y, src0.z and src0.w
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		movaps(x128(tmp3), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);
		shufps(r128(tmp3), m128(tmp3), 0xFF);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));
		mulps(r128(tmp3), m128(tmp0));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp7), m128(tmp3));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZ(dst, tmp1);
	}

	// Emits a 4-component vector times matrix product using four matrix rows, held in
	// the registers src1 to src1 + 3. The rows are transposed with unpack
	// instructions so that tmp5, tmp2, tmp4 and tmp0 hold the x, y, z and w
	// components of all four rows; each is then scaled by the matching component of
	// src0 and the four are summed. The result is passed to SAT_MOV_XYZW.
	void PS_2_0Assembler::M4X4(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("M4X4(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		// Two copies of the first two rows: one for the upper and one for the lower unpack
		movaps(x128(tmp0), m128(src1, 0));
		movaps(x128(tmp1), m128(src1, 1));
		movaps(x128(tmp2), m128(tmp0));
		movaps(x128(tmp3), m128(tmp1)); 

		// tmp0 = w components, tmp4 = z components of the rows
		unpckhps(r128(tmp0), m128(src1, 2));
		unpckhps(r128(tmp1), m128(src1, 3));
		movaps(x128(tmp4), m128(tmp0)); 
		unpckhps(r128(tmp0), m128(tmp1));
		unpcklps(r128(tmp4), m128(tmp1));

		// tmp2 = y components, tmp5 = x components of the rows
		unpcklps(r128(tmp2), m128(src1, 2));
		unpcklps(r128(tmp3), m128(src1, 3));
		movaps(x128(tmp5), m128(tmp2));
		unpckhps(r128(tmp2), m128(tmp3));
		unpcklps(r128(tmp5), m128(tmp3));

		NEG_SWIZZLE(tmp1, src0);

		// Broadcast src0.x, src0.y, src0.z and src0.w
		movaps(x128(tmp6), m128(tmp1));
		movaps(x128(tmp7), m128(tmp1));
		movaps(x128(tmp3), m128(tmp1));
		shufps(r128(tmp1), m128(tmp1), 0x00);
		shufps(r128(tmp6), m128(tmp6), 0x55);
		shufps(r128(tmp7), m128(tmp7), 0xAA);
		shufps(r128(tmp3), m128(tmp3), 0xFF);

		mulps(r128(tmp1), m128(tmp5));
		mulps(r128(tmp6), m128(tmp2));
		mulps(r128(tmp7), m128(tmp4));
		mulps(r128(tmp3), m128(tmp0));

		addps(r128(tmp1), m128(tmp6));
		addps(r128(tmp7), m128(tmp3));
		addps(r128(tmp1), m128(tmp7));

		SAT_MOV_XYZW(dst, tmp1);
	}

	// MAD: emits code for dst = src0 * src1 + src2, component-wise.
	// Each source goes through NEG_SWIZZLE into tmp0/tmp1/tmp2; the result is written
	// through SAT_MASK, which applies the optional _SAT clamp and the write mask of dst.
	void PS_2_0Assembler::MAD(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("MAD(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);
		NEG_SWIZZLE(tmp2, src2);

		mulps(r128(tmp0), m128(tmp1));
		addps(r128(tmp0), m128(tmp2));

		SAT_MASK(dst, tmp0);
	}

	// MAX: emits code for dst = component-wise maximum of src0 and src1 (maxps),
	// after source negate/swizzle; the result is written through SAT_MASK.
	void PS_2_0Assembler::MAX(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("MAX(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		maxps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// MIN: emits code for dst = component-wise minimum of src0 and src1 (minps),
	// after source negate/swizzle; the result is written through SAT_MASK.
	void PS_2_0Assembler::MIN(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("MIN(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		minps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// MOV: emits code that copies src (with negate/swizzle applied) into dst,
	// honouring the optional _SAT clamp and the write mask of dst.
	void PS_2_0Assembler::MOV(Operand &dst, const Operand &src)
	{
		annotate("MOV(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);
		SAT_MASK(dst, tmp0);
	}

	// MUL: emits code for dst = src0 * src1, component-wise (mulps),
	// after source negate/swizzle; the result is written through SAT_MASK.
	void PS_2_0Assembler::MUL(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("MUL(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// NOP: emits a single nop instruction (plus its annotation).
	void PS_2_0Assembler::NOP()
	{
		annotate("NOP()");

		nop();
	}

	// NRM: emits code for dst = s * rsqrt(s.x^2 + s.y^2 + s.z^2), where s is src with
	// its negate/swizzle modifiers applied. The reciprocal square root comes from
	// rsqrtss, so it is an approximation. Overwrites tmp0, tmp1 and tmp2.
	void PS_2_0Assembler::NRM(Operand &dst, const Operand &src)
	{
		annotate("NRM(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);

		// Keep the modified source: tmp0 is consumed by the squared-length computation,
		// and scaling the raw src instead would drop its negate/swizzle modifiers.
		movaps(x128(tmp2), m128(tmp0));

		mulps(r128(tmp0), m128(tmp0));           // x^2 y^2 z^2 w^2
		movhlps(r128(tmp1), r128(tmp0));         // tmp1.x = z^2
		addss(r128(tmp1), xmm32(tmp0));          // tmp1.x = z^2 + x^2
		shufps(r128(tmp0), m128(tmp0), 0x01);    // tmp0.x = y^2
		addss(r128(tmp0), xmm32(tmp1));          // tmp0.x = x^2 + y^2 + z^2

		rsqrtss(r128(tmp0), xmm32(tmp0));
		shufps(r128(tmp0), m128(tmp0), 0x00);    // broadcast 1/length to all components
		mulps(r128(tmp0), m128(tmp2));

		SAT_MASK(dst, tmp0);
	}

	// POW: emits code for dst = EXP(src1 * LOG(src0)), using the LOG and EXP emitters
	// defined elsewhere in this class. Only the first component is multiplied (mulss).
	// NOTE(review): LOG is called with tmp0 as its destination, so whatever write mask
	// tmp0.sel currently holds is what LOG sees - confirm this is intended.
	void PS_2_0Assembler::POW(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("POW(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		LOG(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		mulss(r128(tmp0), xmm32(tmp1));

		EXP(dst, tmp0);
	}

	// RCP: emits code for dst = 1 / src (first component after negate/swizzle),
	// replicated to all components before the masked write.
	// With the _PP modifier the raw rcpss estimate is used; otherwise the estimate r
	// is refined with one Newton-Raphson step: r' = 2*r - x*r*r.
	void PS_2_0Assembler::RCP(Operand &dst, const Operand &src)
	{
		annotate("RCP(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);

		if(instruction->modifier == Instruction::_PP)
		{
			rcpss(r128(tmp1), xmm32(tmp0));
		}
		else
		{
			movss(x128(tmp2), xmm32(tmp0));      // x
			rcpss(r128(tmp1), xmm32(tmp0));      // r ~ 1/x
			mulss(r128(tmp2), xmm32(tmp1));      // x*r
			mulss(r128(tmp2), xmm32(tmp1));      // x*r*r
			addss(r128(tmp1), xmm32(tmp1));      // 2*r
			subss(r128(tmp1), xmm32(tmp2));      // 2*r - x*r*r
		}

		shufps(r128(tmp1), r128(tmp1), 0x00);    // broadcast the scalar result

		SAT_MASK(dst, tmp1);
	}

	// RSQ: emits code for dst = 1 / sqrt(src) (first component after negate/swizzle),
	// replicated to all components before the masked write.
	// With the _PP modifier the raw rsqrtss estimate is used; otherwise the estimate r
	// is refined with one Newton-Raphson step: r' = 0.5 * r * (3 - x*r*r).
	void PS_2_0Assembler::RSQ(Operand &dst, const Operand &src)
	{
		annotate("RSQ(%s, %s)", dst.string(), src.string());

		NEG_SWIZZLE(tmp0, src);

		if(instruction->modifier == Instruction::_PP)
		{
			rsqrtss(r128(tmp1), xmm32(tmp0));
		}
		else
		{
			static const float THREE = 3.0f;
			static const float HALF = 0.5f;

			rsqrtss(r128(tmp2), xmm32(tmp0));        // r ~ 1/sqrt(x)
			movss(x128(tmp1), xmm32(tmp2));          // r
			mulss(r128(tmp2), xmm32(tmp2));          // r*r
			mulss(r128(tmp2), xmm32(tmp0));          // x*r*r
			movss(x128(tmp0), dword_ptr [&THREE]);
			subss(r128(tmp0), xmm32(tmp2));          // 3 - x*r*r
			mulss(r128(tmp1), xmm32(tmp0));          // r * (3 - x*r*r)
			mulss(r128(tmp1), dword_ptr [&HALF]);
		}

		shufps(r128(tmp1), r128(tmp1), 0x00);        // broadcast the scalar result

		SAT_MASK(dst, tmp1);
	}

	// SINCOS: emits code writing cos(x) to dst.x and sin(x) to dst.y, where x is the
	// first component of src0 after negate/swizzle.
	// cos(x) is evaluated as the even polynomial 1 + C*x^2 + B*x^4 + A*x^6.
	// sin(x) is derived as sqrt(1 - cos^2) through rsqrtss followed by rcpss, so it
	// is an approximation and always non-negative.
	// NOTE(review): the sign of sin(x) is therefore lost for negative x - confirm
	// whether callers rely on it.
	// src1 and src2 are only used for the annotation. Overwrites tmp0 through tmp3.
	void PS_2_0Assembler::SINCOS(Operand &dst, const Operand &src0, const Operand &src1, const Operand &src2)
	{
		annotate("SINCOS(%s, %s, %s, %s)", dst.string(), src0.string(), src1.string(), src2.string());

		static const float C = -4.96818924e-1f;
		static const float B =  3.95277743e-2f;
		static const float A = -9.84989568e-4f;

		static const float ONE = 1.0f;

		NEG_SWIZZLE(tmp0, src0);

		mulss(r128(tmp0), xmm32(tmp0));          // x^2
		movss(x128(tmp1), xmm32(tmp0));          // x^2
		movss(x128(tmp3), xmm32(tmp0));          // x^2, preserved for the x^6 term
		mulss(r128(tmp0), dword_ptr [&C]);       // C*x^2
		mulss(r128(tmp1), xmm32(tmp1));          // x^4
		movss(x128(tmp2), xmm32(tmp1));          // x^4
		mulss(r128(tmp1), dword_ptr [&B]);       // B*x^4
		addss(r128(tmp0), xmm32(tmp1));          // C*x^2 + B*x^4
		// Multiply by the preserved x^2: tmp0 already holds the partial sum here,
		// so using it would not produce x^6.
		mulss(r128(tmp2), xmm32(tmp3));          // x^6
		mulss(r128(tmp2), dword_ptr [&A]);       // A*x^6
		addss(r128(tmp0), xmm32(tmp2));
		addss(r128(tmp0), dword_ptr [&ONE]);     // cos(x)
		movss(x128(tmp1), xmm32(tmp0));
		mulss(r128(tmp1), xmm32(tmp1));          // cos^2
		movss(x128(tmp2), dword_ptr [&ONE]);
		subss(r128(tmp2), xmm32(tmp1));          // 1 - cos^2
		rsqrtss(r128(tmp2), xmm32(tmp2));
		rcpss(r128(tmp2), xmm32(tmp2));          // ~sqrt(1 - cos^2)
		movlhps(r128(tmp0), r128(tmp2));         // cos ? sin ?
		shufps(r128(tmp0), m128(tmp0), 0x08);    // cos sin cos cos

		SAT_MOV_XY(dst, tmp0);
	}

	// SUB: emits code for dst = src0 - src1, component-wise (subps),
	// after source negate/swizzle; the result is written through SAT_MASK.
	void PS_2_0Assembler::SUB(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("SUB(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		NEG_SWIZZLE(tmp0, src0);
		NEG_SWIZZLE(tmp1, src1);

		subps(r128(tmp0), m128(tmp1));

		SAT_MASK(dst, tmp0);
	}

	// TEXKILL: emits code that jumps to the "texkill" label when any of src.x, src.y
	// or src.z is less than zero. The w component is compared against -FLT_MAX, so it
	// cannot trigger the jump for any finite value. src is read without negate/swizzle.
	void PS_2_0Assembler::TEXKILL(const Operand &src)
	{
		annotate("TEXKILL(%s)", src.string());

		static const float4 XYZ = {0, 0, 0, -FLT_MAX};
		static int r;   // Backs the general-purpose register that receives the compare mask

		movaps(x128(tmp0), m128(src));

		cmpltps(r128(tmp0), xmmword_ptr [&XYZ]);   // Per-component mask of src < XYZ
		movmskps(x32(&r), r128(tmp0));             // Collect the four mask sign bits
		test(r32(&r), r32(&r));						free(&r);
		jnz("texkill");                            // Any bit set: take the kill path
	}

	// TEXLD: emits code that samples the texture bound to sampler src1 at the
	// coordinates in src0 and writes the colour to dst as four floats.
	//
	// Emitted steps:
	//  1. Once per shader (perspectiveCorrected flag): compute W = 1 / RHW with one
	//     Newton-Raphson refinement and broadcast it.
	//  2. Multiply the coordinates by W; optionally clamp (ADDRESSING_CLAMP).
	//  3. Convert u, v to 16.16 fixed point; optionally mirror (ADDRESSING_MIRROR).
	//  4. Pick a mipmap level from the scaled w component (highest set bit via bsr).
	//  5. Fetch one texel (FILTER_POINT) or four weighted texels (FILTER_LINEAR).
	//  6. Expand the packed colour to floats and scale by 1 / 0x00FF00FF.
	//
	// The static locals only serve as backing storage/identity for the registers
	// handed to the allocator, so they are shared across all invocations.
	void PS_2_0Assembler::TEXLD(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("TEXLD(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		checkDcl(src1);

		// Byte offsets of the per-level tables inside Texture
		const int mipmapOffset	= (int)&((Texture*)0)->mipmap;
		const int uFracOffset	= (int)&((Texture*)0)->uFrac;
		const int vFracOffset	= (int)&((Texture*)0)->vFrac;
		const int uIntOffset	= (int)&((Texture*)0)->uInt;
		const int vIntOffset	= (int)&((Texture*)0)->vInt;
		const int uHalfOffset	= (int)&((Texture*)0)->uHalf;
		const int vHalfOffset	= (int)&((Texture*)0)->vHalf;

		if(!perspectiveCorrected)
		{
			static float4 tmp;

			// W = 1 / RHW, refined: W' = 2*W - RHW*W*W
			movss(x128(&tmp), dword_ptr [&RHW]);
			rcpss(r128(&W), r128(&tmp));
			mulss(r128(&tmp), r128(&W));
			mulss(r128(&tmp), r128(&W));
			addss(r128(&W), r128(&W));
			subss(r128(&W), r128(&tmp));							free(&tmp);
			shufps(r128(&W), r128(&W), 0x00);

			perspectiveCorrected = true;
		}

		movaps(x128(tmp2), m128(src0));
		mulps(r128(tmp2), m128(&W));

		if(sampler[src1.index].addressingMode == Sampler::ADDRESSING_CLAMP)
		{
			// NOTE(review): FLT_MIN is the smallest positive float, so z and w are
			// clamped to be positive rather than left untouched - confirm intent.
			static const float4 clampZero = {0, 0, FLT_MIN, FLT_MIN};
			static const float4 clampOne = {1, 1, FLT_MAX, FLT_MAX};

			maxps(r128(tmp2), xmmword_ptr [&clampZero]);
			minps(r128(tmp2), xmmword_ptr [&clampOne]);
		}

		static const float4 scale = {1 << 16, 1 << 16, 0, 1 << 16};
		static dword2 uv;

		// u and v become 16.16 fixed-point integers
		mulps(r128(tmp2), xmmword_ptr [&scale]);
		cvtps2pi(x64(&uv), r128(tmp2));

		if(sampler[src1.index].addressingMode == Sampler::ADDRESSING_MIRROR)
		{
			// Static like every other register-backing variable in this function:
			// the allocator must never be handed the address of a stack local that
			// is gone by the time the generated code runs.
			static word4 t0;
			static word4 t1;

			// Replicate bit 16 of u and of v across a whole dword each, then XOR:
			// coordinates whose lowest integer bit is set get inverted.
			movq(x64(&t0), r64(&uv));
			pshufw(x64(&t1), r64(&uv), 0xDD);
			pslld(r64(&t0), 15);
			pslld(r64(&t1), 15);
			psrad(r64(&t0), 31);
			psrad(r64(&t1), 31);
			punpckldq(r64(&t0), r64(&t1));		free(&t1);
			pxor(r64(&uv), r64(&t0));			free(&t0);
		}

		// Mipmap LOD
		static int lod;
		static int texture;
		static int buffer;

		shufps(r128(tmp2), r128(tmp2), 0xFF);   // broadcast the scaled w component
		mulss(r128(tmp2), r128(&W));
		cvtss2si(r32(&lod), r128(tmp2));
		bsr(r32(&lod), r32(&lod));              // index of the highest set bit

		mov(x32(&texture), dword_ptr [&sampler[src1.index].texture]);
		mov(x32(&buffer), dword_ptr [r32(&texture)+4*r32(&lod)+mipmapOffset]);

		static int texel;
		static word4 color;

		if(sampler[src1.index].textureFilter == Sampler::FILTER_LINEAR)
		{
			static const qword _F_F = 0x0000FFFF0000FFFF;
			static const qword __FF = 0x00000000FFFFFFFF;

			static word4 &uuuu = (word4&)uv;
			static word4 vvvv;

			pshufw(x64(&vvvv), r64(&uv), 0xAA);
			pshufw(x64(&uuuu), r64(&uv), 0x00);

			paddw(r64(&uuuu), qword_ptr [r32(&texture)+8*r32(&lod)+uHalfOffset]);
			paddw(r64(&vvvv), qword_ptr [r32(&texture)+8*r32(&lod)+vHalfOffset]);

			static dword2 i12;   // Indexes for texel 1 & 2
			static dword2 i34;   // Indexes for texel 3 & 4
			// Backs a 64-bit MMX register, so it needs 64 bits of storage
			// (it was declared as a single dword).
			static word4 tmp;

			movq(x64(&i12), r64(&uuuu));
			movq(x64(&i34), r64(&uuuu));
			movq(x64(&tmp), r64(&vvvv));
			psrlw(r64(&tmp), qword_ptr [r32(&texture)+8*r32(&lod)+vFracOffset]);
			punpckhwd(r64(&i12), r64(&tmp));
			punpcklwd(r64(&i34), r64(&tmp));										free(&tmp);
			psrld(r64(&i12), qword_ptr [r32(&texture)+8*r32(&lod)+uFracOffset]);
			psrld(r64(&i34), qword_ptr [r32(&texture)+8*r32(&lod)+uFracOffset]);

			// Turn the coordinates into the four bilinear weights
			psllw(r64(&uuuu), qword_ptr [r32(&texture)+8*r32(&lod)+uIntOffset]);
			psllw(r64(&vvvv), qword_ptr [r32(&texture)+8*r32(&lod)+vIntOffset]);
			pxor(r64(&uuuu), qword_ptr [&_F_F]);
			pxor(r64(&vvvv), qword_ptr [&__FF]);
			pmulhuw(r64(&uuuu), r64(&vvvv));										free(&vvvv);

			static word4 &weights = uuuu;

			static word4 &c1 = color;
			static word4 c2;
			static word4 c3;
			static word4 c4;

			// Fetch the four texels; punpcklbw puts each colour byte in the high
			// byte of a word (the low bytes are whatever the register held).
			movd(r32(&texel), r64(&i12));
			punpcklbw(x64(&c1), qword_ptr [r32(&buffer)+r32(&texel)*4]);
			psrlq(r64(&i12), 32);
			movd(r32(&texel), r64(&i12));									free(&i12);
			punpcklbw(x64(&c2), qword_ptr [r32(&buffer)+r32(&texel)*4]);

			movd(r32(&texel), r64(&i34));
			punpcklbw(x64(&c3), qword_ptr [r32(&buffer)+r32(&texel)*4]);
			psrlq(r64(&i34), 32);
			movd(r32(&texel), r64(&i34));									free(&i34);
			punpcklbw(x64(&c4), qword_ptr [r32(&buffer)+r32(&texel)*4]);	free(&texel);

			static word4 factor;

			// Weight each texel and accumulate with unsigned saturation
			pshufw(x64(&factor), r64(&weights), 0xAA);
			pmulhuw(r64(&c1), r64(&factor));
			pshufw(x64(&factor), r64(&weights), 0xFF);
			pmulhuw(r64(&c2), r64(&factor));
			pshufw(x64(&factor), r64(&weights), 0x00);
			pmulhuw(r64(&c3), r64(&factor));
			pshufw(x64(&factor), r64(&weights), 0x55);			free(&weights);
			pmulhuw(r64(&c4), r64(&factor));					free(&factor);

			paddusw(r64(&c3), r64(&c4));
			paddusw(r64(&c1), r64(&c2));
			paddusw(r64(&c1), r64(&c3));
		}
		else   // FILTER_POINT
		{
			static word4 i0;

			pshufw(x64(&i0), r64(&uv), 0xAA);
			psrlw(r64(&i0), qword_ptr [r32(&texture)+8*r32(&lod)+vFracOffset]);
			punpcklwd(r64(&uv), r64(&i0));											free(&i0);
			psrld(r64(&uv), qword_ptr [r32(&texture)+8*r32(&lod)+uFracOffset]);

			movd(r32(&texel), r64(&uv));
			punpcklbw(x64(&color), qword_ptr [r32(&buffer)+r32(&texel)*4]);		free(&texel);
		}

		free(&lod);
		free(&texture);
		free(&buffer);

		static word4 &c0 = color;
		static word4 c1;

		// Move the colour bytes to the low byte of each word, then widen to dwords
		psrlw(r64(&color), 8);
		punpckhwd(x64(&c1), r64(&color));
		punpcklwd(x64(&c0), r64(&color));

		static const float4 unscale = {1.0f / 0x00FF00FF, 1.0f / 0x00FF00FF, 1.0f / 0x00FF00FF, 1.0f / 0x00FF00FF};

		// High pair first, moved to the upper half, then the low pair
		cvtpi2ps(r128(dst), r64(&c1));				free(&c1);
		movlhps(r128(dst), r128(dst));
		cvtpi2ps(r128(dst), r64(&c0));				free(&c0);
		mulps(r128(dst), xmmword_ptr [&unscale]);
	}

	// TEXLDB: not implemented. Records the annotation and then always throws Error.
	void PS_2_0Assembler::TEXLDB(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("TEXLDB(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		throw Error("TEXLDB not supported");
	}

	// TEXLDP: not implemented. Records the annotation and then always throws Error.
	void PS_2_0Assembler::TEXLDP(Operand &dst, const Operand &src0, const Operand &src1)
	{
		annotate("TEXLDP(%s, %s, %s)", dst.string(), src0.string(), src1.string());

		throw Error("TEXLDP not supported");
	}

	// Helper macro instructions

	// Emits a copy of src into tmp and, when src carries the NEGATE modifier,
	// flips the sign bit of all four components.
	void PS_2_0Assembler::NEG(Operand &tmp, const Operand &src)
	{
		movaps(x128(tmp), m128(src));

		const bool negated = (src.mod == Operand::NEGATE);

		if(!negated)
		{
			return;   // Plain copy, nothing more to emit
		}

		static const int4 signBits = {0x80000000, 0x80000000, 0x80000000, 0x80000000};

		xorps(r128(tmp), xmmword_ptr [signBits]);
	}

	// Emits a copy of src into tmp followed by a shufps that reorders the components
	// of tmp according to the immediate returned by src.swizzle().
	void PS_2_0Assembler::SWIZZLE(Operand &tmp, const Operand &src)
	{
		movaps(x128(tmp), m128(src));
		shufps(r128(tmp), r128(tmp), src.swizzle());
	}

	// Emits the write of tmp into dst, restricted to the components selected by dst.sel.
	//
	//  - Output colour registers are always written in full.
	//  - An x-only mask is written with movss, an xyzw mask with movaps.
	//  - Any other mask is blended bit-exactly: dst ^= (tmp ^ dst) & mask.
	//
	// tmp is modified by the partial-mask path. Throws INTERNAL_ERROR when dst.sel
	// matches none of the known masks.
	void PS_2_0Assembler::MASK(Operand &dst, Operand &tmp)
	{
		if(dst.type == Operand::OUTPUT_COLOR_REGISTER)
		{
			movaps(x128(dst), m128(tmp));
			return;
		}

		if(dst.sel == xMask)
		{
			// movss between registers replaces only the lowest component, so dst
			// must hold its current value: use r128 rather than x128, which is used
			// throughout this file for write-only destinations (presumably it skips
			// the load). A register source keeps movss in its merging form.
			movss(r128(dst), r128(tmp));
		}
		else if(dst.sel == xyzwMask)
		{
			movaps(x128(dst), r128(tmp));
		}
		else
		{
			static const int4 WRITE_MASK[] = {{-1,  0,  0,  0},	// x
			                                  { 0, -1,  0,  0},	// y
			                                  { 0,  0, -1,  0},	// z
			                                  { 0,  0,  0, -1},	// w
			                                  {-1, -1,  0,  0},	// xy
			                                  {-1,  0, -1,  0},	// xz
			                                  {-1,  0,  0, -1},	// xw
			                                  { 0, -1, -1,  0},	// yz
			                                  { 0, -1,  0, -1},	// yw
			                                  { 0,  0, -1, -1},	// zw
			                                  {-1, -1, -1,  0},	// xyz
			                                  {-1, -1,  0, -1},	// xyw
			                                  {-1,  0, -1, -1},	// xzw
			                                  { 0, -1, -1, -1},	// yzw
			                                  {-1, -1, -1, -1}};	// xyzw

			int m = -1;

			if(dst.sel == xMask) m = 0;
			if(dst.sel == yMask) m = 1;
			if(dst.sel == zMask) m = 2;
			if(dst.sel == wMask) m = 3;
			if(dst.sel == xyMask) m = 4;
			if(dst.sel == xzMask) m = 5;
			if(dst.sel == xwMask) m = 6;
			if(dst.sel == yzMask) m = 7;
			if(dst.sel == ywMask) m = 8;
			if(dst.sel == zwMask) m = 9;
			if(dst.sel == xyzMask) m = 10;
			if(dst.sel == xywMask) m = 11;
			if(dst.sel == xzwMask) m = 12;
			if(dst.sel == yzwMask) m = 13;
			if(dst.sel == xyzwMask) m = 14;

			if(m == -1) throw INTERNAL_ERROR;

			// Bitwise select instead of dst + ((tmp - dst) & mask): the arithmetic
			// form loses precision when the magnitudes differ and produces NaN when
			// the old dst component is Inf or NaN. The XOR form copies the selected
			// components exactly and leaves the others bit-identical.
			xorps(r128(tmp), m128(dst));
			andps(r128(tmp), xmmword_ptr [&WRITE_MASK[m]]);
			xorps(r128(dst), m128(tmp));
		}
	}

	// For instructions carrying the _SAT modifier, emits dst = clamp(tmp, 0, 1)
	// on all four components. Emits nothing for any other modifier.
	void PS_2_0Assembler::SAT(Operand &dst, Operand &tmp)
	{
		const bool saturate = (instruction->modifier == Instruction::_SAT);

		if(!saturate)
		{
			return;
		}

		static const float4 lowerBound = {0, 0, 0, 0};
		static const float4 upperBound = {1, 1, 1, 1};

		movaps(x128(dst), m128(tmp));
		maxps(r128(dst), xmmword_ptr [lowerBound]);
		minps(r128(dst), xmmword_ptr [upperBound]);
	}

	// Loads src into tmp with both source modifiers applied: first the optional
	// negation (NEG), then the component reordering (SWIZZLE).
	// Side effect: tmp.sel is overwritten with src.sel, so that the in-place
	// SWIZZLE(tmp, tmp) call picks up the selection of src through tmp.swizzle().
	void PS_2_0Assembler::NEG_SWIZZLE(Operand &tmp, const Operand &src)
	{
		NEG(tmp, src);
		tmp.sel = src.sel;
		SWIZZLE(tmp, tmp);
	}

	// Finishes an instruction: clamps tmp in place when the instruction has the
	// _SAT modifier (SAT), then writes it to dst under the write mask of dst (MASK).
	void PS_2_0Assembler::SAT_MASK(Operand &dst, Operand &tmp)
	{
		SAT(tmp, tmp);
		MASK(dst, tmp);
	}

	// Like SAT_MASK, but forces the write mask to x regardless of dst.sel.
	// dst itself is left untouched; a copy carries the overridden mask.
	void PS_2_0Assembler::SAT_MOV_X(Operand &dst, Operand &tmp)
	{
		Operand target(dst);
		target.sel = xMask;

		SAT_MASK(target, tmp);
	}

	// Like SAT_MASK, but forces the write mask to xy regardless of dst.sel.
	// dst itself is left untouched; a copy carries the overridden mask.
	void PS_2_0Assembler::SAT_MOV_XY(Operand &dst, Operand &tmp)
	{
		Operand target(dst);
		target.sel = xyMask;

		SAT_MASK(target, tmp);
	}

	// Like SAT_MASK, but forces the write mask to xyz regardless of dst.sel.
	// dst itself is left untouched; a copy carries the overridden mask.
	void PS_2_0Assembler::SAT_MOV_XYZ(Operand &dst, Operand &tmp)
	{
		Operand target(dst);
		target.sel = xyzMask;

		SAT_MASK(target, tmp);
	}

	// Like SAT_MASK, but forces the write mask to xyzw regardless of dst.sel.
	// dst itself is left untouched; a copy carries the overridden mask.
	void PS_2_0Assembler::SAT_MOV_XYZW(Operand &dst, Operand &tmp)
	{
		Operand target(dst);
		target.sel = xyzwMask;

		SAT_MASK(target, tmp);
	}
}